[FFmpeg-devel] [PATCH] lavfi: Port fspp to FFmpeg

Mon Dec 15 02:23:06 CET 2014

On Sun, Dec 14, 2014 at 12:26:15PM +0530, arwa arif wrote:
> I have tried to port fspp. Not sure, if it is correct or not.

[...]

> +static void filter(FSPPContext *p , uint8_t *dst , uint8_t *src,
> +                   int dst_stride , int src_stride ,
> +                   int width , int height ,
> +                   uint8_t *qp_store , int qp_stride , int is_luma) {
> +
> +    int x, x0, y, es, qy, t;
> +    const int stride = is_luma ? p->temp_stride : (width+16); //((width+16+15)&(~15))
> +    const int step = 6 - p->log2_count;
> +    const int qps = 3 + is_luma;
> +    DECLARE_ALIGNED(32 , int32_t , block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
> +    int16_t *block = (int16_t *)block_align;
> +    int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
> +
> +    memset(block3 , 0 , 4 * 8 * BLOCKSZ);
> +
> +    //p->src=src-src_stride*8-8;//!
> +    if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
> +    for (y = 0 ; y < height ; y++) {
> +        int index = 8 + 8*stride + y*stride;
> +        memcpy(p->src + index , src + y*src_stride , width);//this line can be avoided by using DR & user fr.buffers
> +        for (x = 0 ; x < 8 ; x++) {
> +            p->src[index         - x - 1]= p->src[index +         x    ];
> +            p->src[index + width + x    ]= p->src[index + width - x - 1];
> +        }
> +    }
> +    for (y = 0 ; y < 8 ; y++) {
> +        memcpy(p->src + (     7 - y    ) * stride , p->src + (    y + 8    ) * stride , stride);
> +        memcpy(p->src + (height + 8 + y) * stride , p->src + (height -y + 7) * stride , stride);
> +    }
> +    //FIXME (try edge emu)
> +
> +    for (y = 8 ; y < 24 ; y++)
> +        memset(p->temp + 8 + y * stride , 0 , width * sizeof(int16_t));
> +
> +    for (y = step ; y < height + 8 ; y += step) {    //step= 1,2
> +        qy = y - 4;
> +        if (qy > height - 1) qy = height - 1;
> +        if (qy < 0) qy = 0;
> +        qy = (qy >> qps) * qp_stride;
> +        row_fdct_s(block , p->src + y * stride + 2 - (y&1) , stride , 2);
> +        for (x0 = 0 ; x0 < width + 8 - 8 * (BLOCKSZ - 1) ; x0 += 8 * (BLOCKSZ - 1)) {
> +            row_fdct_s(block + 8 * 8 , p->src + y * stride + 8 + x0 + 2 - (y&1) , stride , 2 * (BLOCKSZ - 1));
> +            if(p->qp)
> +                column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block + 0 * 8 , block3 + 0 * 8 , 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
> +            else
> +                for (x = 0 ; x < 8 * (BLOCKSZ - 1) ; x += 8) {
> +                    t = x + x0 -2; //correct t=x+x0-2-(y&1), but its the same
> +                    if (t<0) t = 0;//t always < width-2
> +                    t = qp_store[qy+(t >> qps)];
> +                    t = norm_qscale(t, p->qscale_type);
> +                    if (t != p->prev_q) p->prev_q = t, mul_thrmat_s(p, t);
> +                    column_fidct_s((int16_t*)(&p->threshold_mtx[0]) , block + x * 8 , block3 + x * 8 , 8); //yes, this is a HOTSPOT
> +                }
> +            row_idct_s(block3 + 0*8 , p->temp + (y&15) * stride + x0 + 2 - (y&1) , stride , 2 * (BLOCKSZ - 1));
> +            memmove(block, block + (BLOCKSZ - 1) * 64 , 8 * 8 * sizeof(int16_t)); //cycling
> +            memmove(block3, block3 + (BLOCKSZ - 1) * 64 , 6 * 8 * sizeof(int16_t));
> +        }
> +        //
> +        es = width + 8 - x0; //  8, ...
> +        if (es > 8)
> +            row_fdct_s(block + 8 * 8 , p->src + y * stride + 8 + x0 + 2 - (y&1) , stride , (es - 4) >> 2);
> +        column_fidct_s((int16_t*)(&p->threshold_mtx[0]) , block , block3 , es&(~1));

> +        row_idct_s(block3 + 0 * 8 , p->temp + (y&15) * stride + x0 + 2 - (y&1) , stride , es >> 2);
> +        const int y1 = y - 8 + step;//l5-7  l4-6

This mixes declarations and statements; some compilers have problems
with that.


[...]
> +static void mul_thrmat_mmx(FSPPContext *p, int q) {
> +    uint64_t *adr = &p->threshold_mtx_noq[0];
> +    __asm__ volatile(
> +        "movd %0, %%mm7                \n\t"
> +        "add $8*8*2, %%"REG_D"            \n\t"
> +        "movq 0*8(%%"REG_S"), %%mm0        \n\t"
> +        "punpcklwd %%mm7, %%mm7        \n\t"
> +        "movq 1*8(%%"REG_S"), %%mm1        \n\t"
> +        "punpckldq %%mm7, %%mm7        \n\t"
> +        "pmullw %%mm7, %%mm0           \n\t"
> +
> +        "movq 2*8(%%"REG_S"), %%mm2        \n\t"
> +        "pmullw %%mm7, %%mm1           \n\t"
> +
> +        "movq 3*8(%%"REG_S"), %%mm3        \n\t"
> +        "pmullw %%mm7, %%mm2           \n\t"
> +
> +        "movq %%mm0, 0*8(%%"REG_D")        \n\t"
> +        "movq 4*8(%%"REG_S"), %%mm4        \n\t"
> +        "pmullw %%mm7, %%mm3           \n\t"
> +
> +        "movq %%mm1, 1*8(%%"REG_D")        \n\t"
> +        "movq 5*8(%%"REG_S"), %%mm5        \n\t"
> +        "pmullw %%mm7, %%mm4           \n\t"
> +
> +        "movq %%mm2, 2*8(%%"REG_D")        \n\t"
> +        "movq 6*8(%%"REG_S"), %%mm6        \n\t"
> +        "pmullw %%mm7, %%mm5           \n\t"
> +
> +        "movq %%mm3, 3*8(%%"REG_D")        \n\t"
> +        "movq 7*8+0*8(%%"REG_S"), %%mm0    \n\t"
> +        "pmullw %%mm7, %%mm6           \n\t"
> +
> +        "movq %%mm4, 4*8(%%"REG_D")        \n\t"
> +        "movq 7*8+1*8(%%"REG_S"), %%mm1    \n\t"
> +        "pmullw %%mm7, %%mm0           \n\t"
> +
> +        "movq %%mm5, 5*8(%%"REG_D")        \n\t"
> +        "movq 7*8+2*8(%%"REG_S"), %%mm2    \n\t"
> +        "pmullw %%mm7, %%mm1           \n\t"
> +
> +        "movq %%mm6, 6*8(%%"REG_D")        \n\t"
> +        "movq 7*8+3*8(%%"REG_S"), %%mm3    \n\t"
> +        "pmullw %%mm7, %%mm2           \n\t"
> +
> +        "movq %%mm0, 7*8+0*8(%%"REG_D")    \n\t"
> +        "movq 7*8+4*8(%%"REG_S"), %%mm4    \n\t"
> +        "pmullw %%mm7, %%mm3           \n\t"
> +
> +        "movq %%mm1, 7*8+1*8(%%"REG_D")    \n\t"
> +        "movq 7*8+5*8(%%"REG_S"), %%mm5    \n\t"
> +        "pmullw %%mm7, %%mm4           \n\t"
> +
> +        "movq %%mm2, 7*8+2*8(%%"REG_D")    \n\t"
> +        "movq 7*8+6*8(%%"REG_S"), %%mm6    \n\t"
> +        "pmullw %%mm7, %%mm5           \n\t"
> +
> +        "movq %%mm3, 7*8+3*8(%%"REG_D")    \n\t"
> +        "movq 14*8+0*8(%%"REG_S"), %%mm0   \n\t"
> +        "pmullw %%mm7, %%mm6           \n\t"
> +
> +        "movq %%mm4, 7*8+4*8(%%"REG_D")    \n\t"
> +        "movq 14*8+1*8(%%"REG_S"), %%mm1   \n\t"
> +        "pmullw %%mm7, %%mm0           \n\t"
> +
> +        "movq %%mm5, 7*8+5*8(%%"REG_D")    \n\t"
> +        "pmullw %%mm7, %%mm1           \n\t"
> +
> +        "movq %%mm6, 7*8+6*8(%%"REG_D")    \n\t"
> +        "movq %%mm0, 14*8+0*8(%%"REG_D")   \n\t"
> +        "movq %%mm1, 14*8+1*8(%%"REG_D")   \n\t"
> +
> +        : "+g" (q), "+S" (adr), "+D" (adr)
> +        :
> +        );
> +}
> +
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433)   = FIX64(0.382683433, 14);

> +DECLARE_ALIGNED  (8, uint64_t, ff_MM_FIX_0_541196100)= FIX64(0.541196100, 14);
> +DECLARE_ALIGNED  (8, uint64_t, ff_MM_FIX_0_707106781)= FIX64(0.707106781, 14);

These two definitions conflict with the ones in the existing fspp filter;
they should be removed from one of the two to avoid that conflict.


> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965)   = FIX64(1.306562965, 14);
> +
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A) = FIX64(1.414213562, 14);
> +
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065)   = FIX64(1.847759065, 13);
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930)   = FIX64(-2.613125930, 13); //-
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562)   = FIX64(1.414213562, 13);
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200)   = FIX64(1.082392200, 13);
> +//for t3,t5,t7 == 0 shortcut
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065)   = FIX64(0.847759065, 14);
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497)   = FIX64(0.566454497, 14);
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367)   = FIX64(0.198912367, 14);
> +
> +DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND)       = C64(4);
> +DECLARE_ASM_CONST(8, uint64_t, MM_2)                 = C64(2);
> +
> +static void column_fidct_mmx(int16_t* thr_adr,  int16_t *data,  int16_t *output,  int cnt)
> +{
> +    DECLARE_ALIGNED(8, uint64_t, temps)[4];
> +    __asm__ volatile(

> +        ASMALIGN(4)

This fails to build; you can remove that line.


[...]

-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

I know you won't believe me, but the highest form of Human Excellence is
to question oneself and others. -- Socrates
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20141215/fb78788f/attachment.asc>