[FFmpeg-devel] [PATCH] lavfi: Port fspp to FFmpeg
Michael Niedermayer
michaelni at gmx.at
Mon Dec 15 02:23:06 CET 2014
On Sun, Dec 14, 2014 at 12:26:15PM +0530, arwa arif wrote:
> I have tried to port fspp. Not sure, if it is correct or not.
[...]
> +static void filter(FSPPContext *p , uint8_t *dst , uint8_t *src,
> + int dst_stride , int src_stride ,
> + int width , int height ,
> + uint8_t *qp_store , int qp_stride , int is_luma) {
> +
> + int x, x0, y, es, qy, t;
> + const int stride = is_luma ? p->temp_stride : (width+16); //((width+16+15)&(~15))
> + const int step = 6 - p->log2_count;
> + const int qps = 3 + is_luma;
> + DECLARE_ALIGNED(32 , int32_t , block_align)[4 * 8 * BLOCKSZ + 4 * 8 * BLOCKSZ];
> + int16_t *block = (int16_t *)block_align;
> + int16_t *block3 = (int16_t *)(block_align + 4 * 8 * BLOCKSZ);
> +
> + memset(block3 , 0 , 4 * 8 * BLOCKSZ);
> +
> + //p->src=src-src_stride*8-8;//!
> + if (!src || !dst) return; // HACK avoid crash for Y8 colourspace
> + for (y = 0 ; y < height ; y++) {
> + int index = 8 + 8*stride + y*stride;
> + memcpy(p->src + index , src + y*src_stride , width);//this line can be avoided by using DR & user fr.buffers
> + for (x = 0 ; x < 8 ; x++) {
> + p->src[index - x - 1]= p->src[index + x ];
> + p->src[index + width + x ]= p->src[index + width - x - 1];
> + }
> + }
> + for (y = 0 ; y < 8 ; y++) {
> + memcpy(p->src + ( 7 - y ) * stride , p->src + ( y + 8 ) * stride , stride);
> + memcpy(p->src + (height + 8 + y) * stride , p->src + (height -y + 7) * stride , stride);
> + }
> + //FIXME (try edge emu)
> +
> + for (y = 8 ; y < 24 ; y++)
> + memset(p->temp + 8 + y * stride , 0 , width * sizeof(int16_t));
> +
> + for (y = step ; y < height + 8 ; y += step) { //step= 1,2
> + qy = y - 4;
> + if (qy > height - 1) qy = height - 1;
> + if (qy < 0) qy = 0;
> + qy = (qy >> qps) * qp_stride;
> + row_fdct_s(block , p->src + y * stride + 2 - (y&1) , stride , 2);
> + for (x0 = 0 ; x0 < width + 8 - 8 * (BLOCKSZ - 1) ; x0 += 8 * (BLOCKSZ - 1)) {
> + row_fdct_s(block + 8 * 8 , p->src + y * stride + 8 + x0 + 2 - (y&1) , stride , 2 * (BLOCKSZ - 1));
> + if(p->qp)
> + column_fidct_s((int16_t*)(&p->threshold_mtx[0]), block + 0 * 8 , block3 + 0 * 8 , 8 * (BLOCKSZ - 1)); //yes, this is a HOTSPOT
> + else
> + for (x = 0 ; x < 8 * (BLOCKSZ - 1) ; x += 8) {
> + t = x + x0 -2; //correct t=x+x0-2-(y&1), but its the same
> + if (t<0) t = 0;//t always < width-2
> + t = qp_store[qy+(t >> qps)];
> + t = norm_qscale(t, p->qscale_type);
> + if (t != p->prev_q) p->prev_q = t, mul_thrmat_s(p, t);
> + column_fidct_s((int16_t*)(&p->threshold_mtx[0]) , block + x * 8 , block3 + x * 8 , 8); //yes, this is a HOTSPOT
> + }
> + row_idct_s(block3 + 0*8 , p->temp + (y&15) * stride + x0 + 2 - (y&1) , stride , 2 * (BLOCKSZ - 1));
> + memmove(block, block + (BLOCKSZ - 1) * 64 , 8 * 8 * sizeof(int16_t)); //cycling
> + memmove(block3, block3 + (BLOCKSZ - 1) * 64 , 6 * 8 * sizeof(int16_t));
> + }
> + //
> + es = width + 8 - x0; // 8, ...
> + if (es > 8)
> + row_fdct_s(block + 8 * 8 , p->src + y * stride + 8 + x0 + 2 - (y&1) , stride , (es - 4) >> 2);
> + column_fidct_s((int16_t*)(&p->threshold_mtx[0]) , block , block3 , es&(~1));
> + row_idct_s(block3 + 0 * 8 , p->temp + (y&15) * stride + x0 + 2 - (y&1) , stride , es >> 2);
> + const int y1 = y - 8 + step;//l5-7 l4-6
this mixes declarations and statements, some compilers have problems
with that
[...]
> +static void mul_thrmat_mmx(FSPPContext *p, int q) {
> + uint64_t *adr = &p->threshold_mtx_noq[0];
> + __asm__ volatile(
> + "movd %0, %%mm7 \n\t"
> + "add $8*8*2, %%"REG_D" \n\t"
> + "movq 0*8(%%"REG_S"), %%mm0 \n\t"
> + "punpcklwd %%mm7, %%mm7 \n\t"
> + "movq 1*8(%%"REG_S"), %%mm1 \n\t"
> + "punpckldq %%mm7, %%mm7 \n\t"
> + "pmullw %%mm7, %%mm0 \n\t"
> +
> + "movq 2*8(%%"REG_S"), %%mm2 \n\t"
> + "pmullw %%mm7, %%mm1 \n\t"
> +
> + "movq 3*8(%%"REG_S"), %%mm3 \n\t"
> + "pmullw %%mm7, %%mm2 \n\t"
> +
> + "movq %%mm0, 0*8(%%"REG_D") \n\t"
> + "movq 4*8(%%"REG_S"), %%mm4 \n\t"
> + "pmullw %%mm7, %%mm3 \n\t"
> +
> + "movq %%mm1, 1*8(%%"REG_D") \n\t"
> + "movq 5*8(%%"REG_S"), %%mm5 \n\t"
> + "pmullw %%mm7, %%mm4 \n\t"
> +
> + "movq %%mm2, 2*8(%%"REG_D") \n\t"
> + "movq 6*8(%%"REG_S"), %%mm6 \n\t"
> + "pmullw %%mm7, %%mm5 \n\t"
> +
> + "movq %%mm3, 3*8(%%"REG_D") \n\t"
> + "movq 7*8+0*8(%%"REG_S"), %%mm0 \n\t"
> + "pmullw %%mm7, %%mm6 \n\t"
> +
> + "movq %%mm4, 4*8(%%"REG_D") \n\t"
> + "movq 7*8+1*8(%%"REG_S"), %%mm1 \n\t"
> + "pmullw %%mm7, %%mm0 \n\t"
> +
> + "movq %%mm5, 5*8(%%"REG_D") \n\t"
> + "movq 7*8+2*8(%%"REG_S"), %%mm2 \n\t"
> + "pmullw %%mm7, %%mm1 \n\t"
> +
> + "movq %%mm6, 6*8(%%"REG_D") \n\t"
> + "movq 7*8+3*8(%%"REG_S"), %%mm3 \n\t"
> + "pmullw %%mm7, %%mm2 \n\t"
> +
> + "movq %%mm0, 7*8+0*8(%%"REG_D") \n\t"
> + "movq 7*8+4*8(%%"REG_S"), %%mm4 \n\t"
> + "pmullw %%mm7, %%mm3 \n\t"
> +
> + "movq %%mm1, 7*8+1*8(%%"REG_D") \n\t"
> + "movq 7*8+5*8(%%"REG_S"), %%mm5 \n\t"
> + "pmullw %%mm7, %%mm4 \n\t"
> +
> + "movq %%mm2, 7*8+2*8(%%"REG_D") \n\t"
> + "movq 7*8+6*8(%%"REG_S"), %%mm6 \n\t"
> + "pmullw %%mm7, %%mm5 \n\t"
> +
> + "movq %%mm3, 7*8+3*8(%%"REG_D") \n\t"
> + "movq 14*8+0*8(%%"REG_S"), %%mm0 \n\t"
> + "pmullw %%mm7, %%mm6 \n\t"
> +
> + "movq %%mm4, 7*8+4*8(%%"REG_D") \n\t"
> + "movq 14*8+1*8(%%"REG_S"), %%mm1 \n\t"
> + "pmullw %%mm7, %%mm0 \n\t"
> +
> + "movq %%mm5, 7*8+5*8(%%"REG_D") \n\t"
> + "pmullw %%mm7, %%mm1 \n\t"
> +
> + "movq %%mm6, 7*8+6*8(%%"REG_D") \n\t"
> + "movq %%mm0, 14*8+0*8(%%"REG_D") \n\t"
> + "movq %%mm1, 14*8+1*8(%%"REG_D") \n\t"
> +
> + : "+g" (q), "+S" (adr), "+D" (adr)
> + :
> + );
> +}
> +
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_382683433) = FIX64(0.382683433, 14);
> +DECLARE_ALIGNED (8, uint64_t, ff_MM_FIX_0_541196100)= FIX64(0.541196100, 14);
> +DECLARE_ALIGNED (8, uint64_t, ff_MM_FIX_0_707106781)= FIX64(0.707106781, 14);
these 2 conflict with the existing fspp filter, they should be removed
from one of the two to avoid that conflict
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_306562965) = FIX64(1.306562965, 14);
> +
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562_A) = FIX64(1.414213562, 14);
> +
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_847759065) = FIX64(1.847759065, 13);
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_2_613125930) = FIX64(-2.613125930, 13); //-
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_414213562) = FIX64(1.414213562, 13);
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_1_082392200) = FIX64(1.082392200, 13);
> +//for t3,t5,t7 == 0 shortcut
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_847759065) = FIX64(0.847759065, 14);
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_566454497) = FIX64(0.566454497, 14);
> +DECLARE_ASM_CONST(8, uint64_t, MM_FIX_0_198912367) = FIX64(0.198912367, 14);
> +
> +DECLARE_ASM_CONST(8, uint64_t, MM_DESCALE_RND) = C64(4);
> +DECLARE_ASM_CONST(8, uint64_t, MM_2) = C64(2);
> +
> +static void column_fidct_mmx(int16_t* thr_adr, int16_t *data, int16_t *output, int cnt)
> +{
> + DECLARE_ALIGNED(8, uint64_t, temps)[4];
> + __asm__ volatile(
> + ASMALIGN(4)
this fails to build, you can remove that line
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
I know you won't believe me, but the highest form of Human Excellence is
to question oneself and others. -- Socrates
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20141215/fb78788f/attachment.asc>
More information about the ffmpeg-devel
mailing list