[FFmpeg-devel] [FFMpeg-Devel] [PATCH 4/7] Replaced BLOCK_HEIGHT macro with block_height and block_width variables
Michael Niedermayer
michaelni at gmx.at
Fri Mar 13 19:53:31 CET 2015
On Fri, Mar 13, 2015 at 02:15:11PM -0400, Tucker DiNapoli wrote:
> This change is to allow support for different sized blocks, which will
> be necessary for sse and avx. My plan is for the code to still act on
> 8x8 blocks, but to process multiple 8x8 blocks in parallel when using
> sse/avx.
> ---
> libpostproc/postprocess.c | 3 ---
> libpostproc/postprocess_c.c | 36 ++++++++++++++++++------------------
> libpostproc/postprocess_internal.h | 17 ++++++++++++++++-
> libpostproc/postprocess_template.c | 18 +++++++++---------
> 4 files changed, 43 insertions(+), 31 deletions(-)
>
> diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c
> index 2cdd988..3090869 100644
> --- a/libpostproc/postprocess.c
> +++ b/libpostproc/postprocess.c
> @@ -115,9 +115,6 @@ const char *postproc_license(void)
>
> #define GET_MODE_BUFFER_SIZE 500
> #define OPTIONS_ARRAY_SIZE 10
> -#define BLOCK_SIZE 8
> -#define TEMP_STRIDE 8
> -//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
>
> #if ARCH_X86 && HAVE_INLINE_ASM
> DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
> diff --git a/libpostproc/postprocess_c.c b/libpostproc/postprocess_c.c
> index 3d3b738..5660c64 100644
> --- a/libpostproc/postprocess_c.c
> +++ b/libpostproc/postprocess_c.c
> @@ -32,7 +32,7 @@ static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *
> const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
> const int dcThreshold= dcOffset*2 + 1;
>
> - for(y=0; y<BLOCK_SIZE; y++){
> + for(y=0; y<block_height; y++){
> numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
> numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
> numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
> @@ -56,7 +56,7 @@ static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c
> const int dcThreshold= dcOffset*2 + 1;
>
> src+= stride*4; // src points to begin of the 8x8 Block
> - for(y=0; y<BLOCK_SIZE-1; y++){
> + for(y=0; y<block_height-1; y++){
> numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
> numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
> numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
> @@ -90,7 +90,7 @@ static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
> {
> int x;
> src+= stride*4;
> - for(x=0; x<BLOCK_SIZE; x+=4){
> + for(x=0; x<block_width; x+=4){
> if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
> if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
> if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
> @@ -120,7 +120,7 @@ static inline int vertClassify_C(const uint8_t src[], int stride, const PPContex
> static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
> {
> int y;
> - for(y=0; y<BLOCK_SIZE; y++){
> + for(y=0; y<block_height; y++){
> const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
>
> if(FFABS(middleEnergy) < 8*c->QP){
> @@ -159,7 +159,7 @@ static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext
> static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
> {
> int y;
> - for(y=0; y<BLOCK_SIZE; y++){
> + for(y=0; y<block_height; y++){
> const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
> const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
>
> @@ -229,7 +229,7 @@ static inline void horizX1Filter(uint8_t *src, int stride, int QP)
> }
> }
>
> - for(y=0; y<BLOCK_SIZE; y++){
> + for(y=0; y<block_height; y++){
> int a= src[1] - src[2];
> int b= src[3] - src[4];
> int c= src[5] - src[6];
> @@ -392,7 +392,7 @@ static inline void doVertLowPass_C(uint8_t *src, int stride, PPContext *c)
> const int l9= stride + l8;
> int x;
> src+= stride*3;
> - for(x=0; x<BLOCK_SIZE; x++){
> + for(x=0; x<block_width; x++){
> const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
> const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
>
> @@ -443,7 +443,7 @@ static inline void vertX1Filter_C(uint8_t *src, int stride, PPContext *co)
> int x;
>
> src+= stride*3;
> - for(x=0; x<BLOCK_SIZE; x++){
> + for(x=0; x<block_width; x++){
> int a= src[l3] - src[l4];
> int b= src[l4] - src[l5];
> int c= src[l5] - src[l6];
> @@ -478,7 +478,7 @@ static inline void doVertDefFilter_C(uint8_t src[], int stride, PPContext *c)
> // const int l9= stride + l8;
> int x;
> src+= stride*3;
> - for(x=0; x<BLOCK_SIZE; x++){
> + for(x=0; x<block_width; x++){
> const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
> if(FFABS(middleEnergy) < 8*c->QP){
> const int q=(src[l4] - src[l5])/2;
> @@ -881,13 +881,13 @@ static inline void blockCopy_C(uint8_t dst[], int dstStride, const uint8_t src[]
> {
> int i;
> if(levelFix){
> - for(i=0; i<8; i++)
> + for(i=0; i<block_height; i++)
> memcpy( &(dst[dstStride*i]),
> - &(src[srcStride*i]), BLOCK_SIZE);
> + &(src[srcStride*i]), block_width);
> }else{
> - for(i=0; i<8; i++)
> + for(i=0; i<block_height; i++)
> memcpy( &(dst[dstStride*i]),
> - &(src[srcStride*i]), BLOCK_SIZE);
> + &(src[srcStride*i]), block_width);
> }
> }
>
> @@ -908,7 +908,7 @@ static inline void duplicate_C(uint8_t src[], int stride)
> * Filter array of bytes (Y or U or V values)
> */
> static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
> - const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
> + const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
> {
> DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
> int x,y;
> @@ -999,7 +999,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
> }
>
> /* copy & deinterlace first row of blocks */
> - y=-BLOCK_SIZE;
> + y=-block_height;
> {
> const uint8_t *srcBlock= &(src[y*srcStride]);
> uint8_t *dstBlock= tempDst + dstStride;
> @@ -1007,7 +1007,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
> // From this point on it is guaranteed that we can read and write 16 lines downward
> // finish 1 block before the next otherwise we might have a problem
> // with the L1 Cache of the P4 ... or only a few blocks at a time or something
> - for(x=0; x<width; x+=BLOCK_SIZE){
> + for(x=0; x<width; x+=block_width){
>
>
> blockCopy_C(dstBlock + dstStride*8, dstStride,
> @@ -1043,7 +1043,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
> }
> }
>
> - for(y=0; y<height; y+=BLOCK_SIZE){
> + for(y=0; y<height; y+=block_height){
> //1% speedup if these are here instead of the inner loop
> const uint8_t *srcBlock= &(src[y*srcStride]);
> uint8_t *dstBlock= &(dst[y*dstStride]);
> @@ -1077,7 +1077,7 @@ static void postProcess_C(const uint8_t src[], int srcStride, uint8_t dst[], int
> // From this point on it is guaranteed that we can read and write 16 lines downward
> // finish 1 block before the next otherwise we might have a problem
> // with the L1 Cache of the P4 ... or only a few blocks at a time or something
> - for(x=0; x<width; x+=BLOCK_SIZE){
> + for(x=0; x<width; x+=block_width){
> const int stride= dstStride;
> if(isColor){
> QP= QPptr[x>>qpHShift];
> diff --git a/libpostproc/postprocess_internal.h b/libpostproc/postprocess_internal.h
> index 1ebd974..5a7be1f 100644
> --- a/libpostproc/postprocess_internal.h
> +++ b/libpostproc/postprocess_internal.h
> @@ -174,5 +174,20 @@ static inline void linecpy(void *dest, const void *src, int lines, int stride) {
> memcpy((uint8_t*)dest+(lines-1)*stride, (const uint8_t*)src+(lines-1)*stride, -lines*stride);
> }
> }
> -
> +/*
> + Currently blocks are always 8xN bytes, where N is determined by the size of
> + the simd registers being used
> +*/
> +static const int block_height = 8;
> +#if ARCH_X86 && !CONFIG_RUNTIME_CPUDETECT
> +#if HAVE_AVX2
> +static const int block_width = 32;
> +#elif HAVE_SSE2
> +static const int block_width = 16;
> +#else
> +static const int block_width = 8;
> +#endif
> +#else
> +static int block_width; //determined at runtime
> +#endif
> #endif /* POSTPROC_POSTPROCESS_INTERNAL_H */
non constant globals are not allowed, there can be multiple instances
of the postprocess library and each might in theory use a different
set of cpu flags and optimizations
also this does not work
all tests fail:
TEST filter-pp
--- ./tests/ref/fate/filter-pp 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp 2015-03-13 19:49:45.559780559 +0100
@@ -1 +1 @@
-pp 3730f1ed7bf244ce059d110e21f39bbd
+pp
\ No newline at end of file
Test filter-pp failed. Look at tests/data/fate/filter-pp.err for details.
make: *** [fate-filter-pp] Error 139
TEST filter-pp1
--- ./tests/ref/fate/filter-pp1 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp1 2015-03-13 19:49:45.647780561 +0100
@@ -1 +1 @@
-pp1 cb9f884e27a5be11f72afc9b517efd10
+pp1
\ No newline at end of file
Test filter-pp1 failed. Look at tests/data/fate/filter-pp1.err for details.
make: *** [fate-filter-pp1] Error 139
TEST filter-pp2
--- ./tests/ref/fate/filter-pp2 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp2 2015-03-13 19:49:45.731780563 +0100
@@ -1 +1 @@
-pp2 975d4c1d489e42ddbd4a27464e8355af
+pp2
\ No newline at end of file
Test filter-pp2 failed. Look at tests/data/fate/filter-pp2.err for details.
make: *** [fate-filter-pp2] Error 139
TEST filter-pp3
--- ./tests/ref/fate/filter-pp3 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp3 2015-03-13 19:49:45.811780565 +0100
@@ -1 +1 @@
-pp3 ef0f10f1859af2f75717e8c9d64ee38a
+pp3
\ No newline at end of file
Test filter-pp3 failed. Look at tests/data/fate/filter-pp3.err for details.
make: *** [fate-filter-pp3] Error 139
TEST filter-pp4
--- ./tests/ref/fate/filter-pp4 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp4 2015-03-13 19:49:45.883780566 +0100
@@ -1 +1 @@
-pp4 0a2895c619ab9c6c22fd7cffb25070a8
+pp4
\ No newline at end of file
Test filter-pp4 failed. Look at tests/data/fate/filter-pp4.err for details.
make: *** [fate-filter-pp4] Error 139
TEST filter-pp5
--- ./tests/ref/fate/filter-pp5 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp5 2015-03-13 19:49:45.963780568 +0100
@@ -1 +1 @@
-pp5 5fc6703d42bd98942e5dd104ce220291
+pp5
\ No newline at end of file
Test filter-pp5 failed. Look at tests/data/fate/filter-pp5.err for details.
make: *** [fate-filter-pp5] Error 139
TEST filter-pp6
--- ./tests/ref/fate/filter-pp6 2015-03-13 19:34:25.103761168 +0100
+++ tests/data/fate/filter-pp6 2015-03-13 19:49:46.035780569 +0100
@@ -1 +1 @@
-pp6 93b508d07bfcf4703aa7dff2d2ef5c03
+pp6
\ No newline at end of file
Test filter-pp6 failed. Look at tests/data/fate/filter-pp6.err for details.
make: *** [fate-filter-pp6] Error 139
and i do not understand how this could work, the QP parameter is at
a granularity of 16x16 in the luma plane and 8x8 in the chroma planes
the existing code uses one QP or 2 adjacent ones while processing a
8x8 block. If you increase the block size there will be more QP
parameters to consider
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
If a bugfix only changes things apparently unrelated to the bug with no
further explanation, that is a good sign that the bugfix is wrong.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150313/32616914/attachment.asc>
More information about the ffmpeg-devel
mailing list