[FFmpeg-devel] [PATCH 1/2] avfilter/transpose: refactor for asm

Thu Sep 12 20:08:38 CEST 2013

On Thu, Sep 12, 2013 at 04:52:54PM +0000, Paul B Mahol wrote:
> Signed-off-by: Paul B Mahol <onemda at gmail.com>
> ---
>  libavfilter/vf_transpose.c | 124 ++++++++++++++++++++++++++++++---------------
>  1 file changed, 82 insertions(+), 42 deletions(-)
> 
> diff --git a/libavfilter/vf_transpose.c b/libavfilter/vf_transpose.c
> index 8daeeaf..d19198c 100644
> --- a/libavfilter/vf_transpose.c
> +++ b/libavfilter/vf_transpose.c
> @@ -58,6 +58,9 @@ typedef struct {
>  
>      PassthroughType passthrough; ///< landscape passthrough mode enabled
>      enum TransposeDir dir;
> +
> +    void (*transpose_block)(uint8_t *src, int src_linesize,
> +                            uint8_t *dst, int dst_linesize);
>  } TransContext;
>  
>  static int query_formats(AVFilterContext *ctx)
> @@ -79,6 +82,67 @@ static int query_formats(AVFilterContext *ctx)
>      return 0;
>  }
>  
> +static void transpose_8_c(uint8_t *src, int src_linesize,
> +                          uint8_t *dst, int dst_linesize)
> +{
> +    int x, y;
> +    for (y = 0; y < 8; y++, dst += dst_linesize, src++)
> +        for (x = 0; x < 8; x++)
> +            dst[x] = src[x*src_linesize];
> +}
> +
> +static void transpose_16_c(uint8_t *src, int src_linesize,
> +                           uint8_t *dst, int dst_linesize)
> +{
> +    int x, y;
> +    for (y = 0; y < 8; y++, dst += dst_linesize, src += 2)
> +        for (x = 0; x < 8; x++)
> +            *((uint16_t *)(dst + 2*x)) = *((uint16_t *)(src + x*src_linesize));
> +}
> +
> +static void transpose_24_c(uint8_t *src, int src_linesize,
> +                           uint8_t *dst, int dst_linesize)
> +{
> +    int x, y;
> +    for (y = 0; y < 8; y++, dst += dst_linesize) {
> +        for (x = 0; x < 8; x++) {
> +            int32_t v = AV_RB24(src + x*src_linesize + y*3);
> +            AV_WB24(dst + 3*x, v);
> +        }
> +    }
> +}
> +
> +static void transpose_32_c(uint8_t *src, int src_linesize,
> +                           uint8_t *dst, int dst_linesize)
> +{
> +    int x, y;
> +    for (y = 0; y < 8; y++, dst += dst_linesize, src += 4) {
> +        for (x = 0; x < 8; x++)
> +            *((uint32_t *)(dst + 4*x)) = *((uint32_t *)(src + x*src_linesize));
> +    }
> +}
> +
> +static void transpose_48_c(uint8_t *src, int src_linesize,
> +                           uint8_t *dst, int dst_linesize)
> +{
> +    int x, y;
> +    for (y = 0; y < 8; y++, dst += dst_linesize, src += 6) {
> +        for (x = 0; x < 8; x++) {
> +            int64_t v = AV_RB48(src + x*src_linesize);
> +            AV_WB48(dst + 6*x, v);
> +        }
> +    }
> +}
> +
> +static void transpose_64_c(uint8_t *src, int src_linesize,
> +                           uint8_t *dst, int dst_linesize)
> +{
> +    int x, y;
> +    for (y = 0; y < 8; y++, dst += dst_linesize, src += 8)
> +        for (x = 0; x < 8; x++)
> +            *((uint64_t *)(dst + 8*x)) = *((uint64_t *)(src + x*src_linesize));
> +}
> +
>  static int config_props_output(AVFilterLink *outlink)
>  {
>      AVFilterContext *ctx = outlink->src;
> @@ -117,6 +181,15 @@ static int config_props_output(AVFilterLink *outlink)
>      } else
>          outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
>  
> +    switch (trans->pixsteps[0]) {
> +    case 1: trans->transpose_block = transpose_8_c;  break;
> +    case 2: trans->transpose_block = transpose_16_c; break;
> +    case 3: trans->transpose_block = transpose_24_c; break;
> +    case 4: trans->transpose_block = transpose_32_c; break;
> +    case 6: trans->transpose_block = transpose_48_c; break;
> +    case 8: trans->transpose_block = transpose_64_c; break;
> +    }
> +
>      av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n",
>             inlink->w, inlink->h, trans->dir, outlink->w, outlink->h,
>             trans->dir == 1 || trans->dir == 3 ? "clockwise" : "counterclockwise",
> @@ -174,47 +247,12 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr,
>              dstlinesize *= -1;
>          }
>  
> -        switch (pixstep) {
> -        case 1:
> -            for (y = start; y < end; y++, dst += dstlinesize)
> -                for (x = 0; x < outw; x++)
> -                    dst[x] = src[x*srclinesize + y];
> -            break;
> -        case 2:
> -            for (y = start; y < end; y++, dst += dstlinesize) {
> -                for (x = 0; x < outw; x++)
> -                    *((uint16_t *)(dst + 2*x)) = *((uint16_t *)(src + x*srclinesize + y*2));
> -            }
> -            break;
> -        case 3:
> -            for (y = start; y < end; y++, dst += dstlinesize) {
> -                for (x = 0; x < outw; x++) {
> -                    int32_t v = AV_RB24(src + x*srclinesize + y*3);
> -                    AV_WB24(dst + 3*x, v);
> -                }
> -            }
> -            break;
> -        case 4:
> -            for (y = start; y < end; y++, dst += dstlinesize) {
> -                for (x = 0; x < outw; x++)
> -                    *((uint32_t *)(dst + 4*x)) = *((uint32_t *)(src + x*srclinesize + y*4));
> -            }
> -            break;
> -        case 6:
> -            for (y = start; y < end; y++, dst += dstlinesize) {
> -                for (x = 0; x < outw; x++) {
> -                    int64_t v = AV_RB48(src + x*srclinesize + y*6);
> -                    AV_WB48(dst + 6*x, v);
> -                }
> -            }
> -            break;
> -        case 8:
> -            for (y = start; y < end; y++, dst += dstlinesize) {
> -                for (x = 0; x < outw; x++)
> -                    *((uint64_t *)(dst + 8*x)) = *((uint64_t *)(src + x*srclinesize + y*8));
> -            }
> -            break;
> -        }

> +        for (y = start; y < end; y += 8)
> +            for (x = 0; x < outw; x += 8)
> +                trans->transpose_block(src + x * srclinesize + y * pixstep,
> +                                       srclinesize,
> +                                       dst + (y - start) * dstlinesize + x * pixstep,
> +                                       dstlinesize);

Raster scan order still looks like a bad idea to me.
Also, one call per 64 pixels has its overhead.


[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

When you are offended at any man's fault, turn to yourself and study your
own failings. Then you will forget your anger. -- Epictetus
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130912/1a460e32/attachment.asc>