[Ffmpeg-devel] H.264 encoder
Panagiotis Issaris
takis.issaris
Wed Oct 4 02:34:00 CEST 2006
Hi,
Attached to this e-mail are the MMX optimizations for the basic H.264 encoder.
With friendly regards,
Jori & Takis
-------------- next part --------------
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index 6ec808e..2194442 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -3225,6 +3225,15 @@ #endif //CONFIG_ENCODERS
c->h264_idct8_dc_add=
c->h264_idct8_add= ff_h264_idct8_add_mmx;
+ c->h264_dct = ff_h264_dct_mmx;
+ c->h264_idct_notranspose_add = ff_h264_idct_notranspose_add_mmx;
+ c->h264_hadamard_mult4x4 = ff_h264_hadamard_mult4x4_mmx;
+ c->h264_hadamard_quant_2x2 = ff_h264_hadamard_quant_2x2_mmx;
+ c->h264_hadamard_quant_4x4 = ff_h264_hadamard_quant_4x4_mmx;
+ c->h264_hadamard_invquant_4x4 = ff_h264_hadamard_invquant_4x4_mmx;
+ c->h264_transform_dct_quant = ff_h264_transform_dct_quant_mmx;
+ c->h264_transform_inverse_quant_dct_add = ff_h264_transform_inverse_quant_dct_add_mmx;
+
if (mm_flags & MM_MMXEXT) {
c->prefetch = prefetch_mmx2;
diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c
index 83ee362..ca2b5cc 100644
--- a/libavcodec/i386/h264dsp_mmx.c
+++ b/libavcodec/i386/h264dsp_mmx.c
@@ -46,6 +46,11 @@ #define IDCT4_1D( s02, s13, d02, d13, t
SUMSUBD2_AB( s13, d13, t )\
SUMSUB_BADC( d13, s02, s13, d02 )
+/* SBUTTERFLY: interleave step used to build TRANSPOSE4.
+ * Interleaves MMX registers a and b at granularity n ("wd" = 16-bit words):
+ * a receives the low-half interleave, t the high-half interleave; b is
+ * left unchanged. */
+#define SBUTTERFLY(a,b,t,n)\
+ "movq " #a ", " #t " \n\t" /* abcd */\
+ "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
+ "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
+
#define TRANSPOSE4(a,b,c,d,t)\
SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
@@ -99,6 +104,131 @@ static void ff_h264_idct_add_mmx(uint8_t
);
}
+/* 4x4 IDCT + add, variant of ff_h264_idct_add_mmx for coefficient blocks
+ * stored without the final transpose: the block is transposed on load, then
+ * the usual two IDCT4_1D passes (with a transpose in between) are applied
+ * and the result is added to dst row by row via STORE_DIFF_4P.
+ * ff_pw_32 is added before the second pass -- presumably the rounding bias
+ * for the final shift inside STORE_DIFF_4P (macro not visible here).
+ * NOTE(review): MMX register contents are carried across three separate
+ * asm statements, which gcc does not formally guarantee; the surrounding
+ * file uses the same pattern. */
+static void ff_h264_idct_notranspose_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{
+ /* Load dct coeffs */
+ asm volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "movq 16(%0), %%mm3 \n\t"
+ "movq 24(%0), %%mm1 \n\t"
+ TRANSPOSE4(%%mm0, %%mm4, %%mm3, %%mm1, %%mm2)
+ :: "r"(block) );
+
+ asm volatile(
+ /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
+ IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
+
+ "movq %0, %%mm6 \n\t"
+ /* in: 1,4,0,2 out: 1,2,3,0 */
+ TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
+
+ "paddw %%mm6, %%mm3 \n\t"
+
+ /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
+ IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
+
+ "pxor %%mm7, %%mm7 \n\t"
+ :: "m"(ff_pw_32));
+
+ /* add the four reconstructed rows (mm0..mm4) to dst */
+ asm volatile(
+ STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
+ : "+r"(dst)
+ : "r" ((long)stride)
+ );
+}
+
+/* One 1-D pass of the H.264 forward 4x4 core transform on four packed-word
+ * registers a,b,c,d (one sample per column). Outputs:
+ *   u = a + b + c + d
+ *   v = 2*(a - d) + (b - c)
+ *   w = a - b - c + d
+ *   x = (a - d) - 2*(b - c)
+ * a and b are clobbered (used as scratch for the doubled terms). */
+#define FORWARD_DCT_PART(a,b,c,d,u,v,w,x) \
+ "movq " #a ", " #u " \n\t" \
+ "paddw " #b ", " #u " \n\t" \
+ "paddw " #c ", " #u " \n\t" \
+ "paddw " #d ", " #u " \n\t" \
+ "movq " #a ", " #w " \n\t" \
+ "psubw " #b ", " #w " \n\t" \
+ "psubw " #c ", " #w " \n\t" \
+ "paddw " #d ", " #w " \n\t" \
+ "movq " #a ", " #x " \n\t" \
+ "psubw " #d ", " #x " \n\t" \
+ "movq " #b ", " #v " \n\t" \
+ "psubw " #c ", " #v " \n\t" \
+ "movq " #v ", " #a " \n\t" \
+ "movq " #x ", " #b " \n\t" \
+ "psllw $1, " #a " \n\t" \
+ "psllw $1, " #b " \n\t" \
+ "paddw " #b ", " #v " \n\t" \
+ "psubw " #a ", " #x " \n\t"
+
+/* Forward 4x4 H.264 core transform: two FORWARD_DCT_PART passes with a
+ * TRANSPOSE4 in between. Input is presumably kept in transposed layout
+ * (see the comment above ff_h264_transform_dct_quant_mmx). The final
+ * coefficients end up in mm1/mm2/mm3/mm5 and are written to outblock;
+ * inblock is not modified.
+ * NOTE(review): registers are carried across separate asm statements,
+ * matching the style of the rest of this file. */
+static void ff_h264_dct_mmx(int16_t inblock[4][4], int16_t outblock[4][4])
+{
+ /* Load the four input rows */
+ asm volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm1 \n\t"
+ "movq 16(%0), %%mm2 \n\t"
+ "movq 24(%0), %%mm3 \n\t"
+ :: "r"(inblock) );
+
+ asm volatile(
+ FORWARD_DCT_PART( %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
+ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
+ FORWARD_DCT_PART( %%mm4, %%mm7, %%mm0, %%mm6, %%mm1, %%mm2, %%mm3, %%mm5)
+ ::);
+ asm volatile(
+ "movq %%mm1, (%0) \n\t"
+ "movq %%mm2, 8(%0) \n\t"
+ "movq %%mm3,16(%0) \n\t"
+ "movq %%mm5,24(%0) \n\t"
+ : "+r"(outblock)
+ : );
+}
+
+/* One 1-D pass of a 4-point Hadamard transform on packed-word registers
+ * a,b,c,d. Outputs:
+ *   u = a + b + c + d
+ *   v = a + b - c - d
+ *   x = a - b + c - d
+ *   w = a - b - c + d
+ * a and c are clobbered (overwritten with a-b and c-d respectively). */
+#define HADAMARD_MULT_PART(a,b,c,d,u,v,w,x) \
+ "movq " #a ", " #u " \n\t"\
+ "paddw " #b ", " #u " \n\t"\
+ "movq " #u ", " #v " \n\t"\
+ "movq " #c ", " #w " \n\t"\
+ "paddw " #d ", " #w " \n\t"\
+ "paddw " #w ", " #u " \n\t"\
+ "psubw " #w ", " #v " \n\t"\
+ "psubw " #b ", " #a " \n\t"\
+ "movq " #a ", " #w " \n\t"\
+ "movq " #a ", " #x " \n\t"\
+ "psubw " #d ", " #c " \n\t"\
+ "paddw " #c ", " #x " \n\t"\
+ "psubw " #c ", " #w " \n\t"
+
+/* In-place 4x4 Hadamard transform of Y: two HADAMARD_MULT_PART passes,
+ * each preceded by a TRANSPOSE4 so both dimensions are covered.
+ * NOTE(review): register state crosses separate asm statements, as
+ * elsewhere in this file. */
+static void ff_h264_hadamard_mult4x4_mmx(int16_t Y[4][4])
+{
+ /* Load the four rows of Y */
+ asm volatile(
+ "movq (%0), %%mm4 \n\t"
+ "movq 8(%0), %%mm5 \n\t"
+ "movq 16(%0), %%mm6 \n\t"
+ "movq 24(%0), %%mm7 \n\t"
+ :: "r"(Y) );
+
+ asm volatile(
+ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
+ HADAMARD_MULT_PART( %%mm4, %%mm7, %%mm0, %%mm6, %%mm1, %%mm2, %%mm3, %%mm5)
+ TRANSPOSE4(%%mm1, %%mm2, %%mm3, %%mm5, %%mm0)
+ HADAMARD_MULT_PART( %%mm1, %%mm5, %%mm0, %%mm3, %%mm2, %%mm4, %%mm6, %%mm7)
+ :: );
+ asm volatile(
+ "movq %%mm2, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ "movq %%mm6,16(%0) \n\t"
+ "movq %%mm7,24(%0) \n\t"
+ : "+r"(Y)
+ : );
+}
+
+
static inline void h264_idct8_1d(int16_t *block)
{
asm volatile(
@@ -1349,6 +1479,7 @@ static void OPNAME ## h264_qpel ## SIZE
}\
+/* Plain "mov" store: instantiates the put_ variants of the templated
+ * pixel functions (counterpart of the averaging AVG_*_OP stores below). */
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
@@ -1503,6 +1634,343 @@ static inline void ff_h264_biweight_WxH_
}
}
+extern const int16_t ff_h264_MF00[6];
+extern const int16_t ff_h264_V00[6];
+
+// hadamard quantization
+/**
+ * Quantize four 16-bit coefficients with an extra magnitude halving:
+ *   result[i] = sign(outblock[i]) * (((|outblock[i]| >> 1) * MF[i] + f[0]) >> qbits[0])
+ * f[0] and f[1] must both hold the same 32-bit rounding constant (one per
+ * dword lane); qbits[0] is the right-shift amount. The >>1 folds the
+ * pending /2 of the Hadamard DC transform into the quantization step.
+ * Identical to ff_h264_transform_quantize_mmx apart from that >>1.
+ * NOTE(review): MMX register contents are carried across separate asm
+ * statements, matching the style used throughout this file.
+ */
+void ff_h264_transform_quantize2_mmx(const int16_t *outblock, const int16_t *MF, int16_t *result, const int32_t *f, const int32_t qbits[1])
+{
+ static const int64_t oneconst = 0x0001000100010001LL; /* four words of 1 ("static" first; trailing storage-class is obsolescent in C99) */
+
+ asm volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq (%1), %%mm4 \n\t"
+ :: "r"(outblock), "r"(MF));
+ asm volatile(
+ /* derive mm1 = per-word +1/-1 sign of outblock[0..3] from oneconst (%0) */
+ "movq %%mm0, %%mm7 \n\t"
+ "psraw $14, %%mm7 \n\t"
+ "movq %0, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "psllw $1, %%mm2 \n\t"
+ "pand %%mm2, %%mm7 \n\t"
+ "psubw %%mm7, %%mm1 \n\t" // mm1 now contains the signs of outblock[0..3]
+ "movq %%mm0, %%mm7 \n\t"
+ "pmullw %%mm1, %%mm7 \n\t" // mm7 now contains ABS(outblock)
+ "psraw $1, %%mm7 \n\t" // mm7 now contains ABS(outblock) >> 1
+ "movq %%mm7, %%mm6 \n\t"
+ "pmullw %%mm4, %%mm6 \n\t"
+ "pmulhw %%mm4, %%mm7 \n\t" // mm6 now contains low words of (ABS(outblock)>>1)*MF, mm7 contains the high words
+ "movq %%mm6, %%mm5 \n\t" // now mm5 also contains low words
+ "punpcklwd %%mm7, %%mm5 \n\t" // mm5 should now contain the lowest 2 32-bit numbers from (ABS(outblock)>>1)*MF
+ "punpckhwd %%mm7, %%mm6 \n\t" // mm6 should now contain the highest 2 32-bit numbers from (ABS(outblock)>>1)*MF
+ "movq (%1), %%mm3 \n\t"
+ "paddd %%mm3, %%mm5 \n\t" // they now contain (ABS(outblock)>>1)*MF+f
+ "paddd %%mm3, %%mm6 \n\t"
+ "movd (%2), %%mm3 \n\t"
+ "psrld %%mm3, %%mm5 \n\t" // mm5 and mm6 now contain ((ABS(outblock)>>1)*MF+f) >> qbits
+ "psrld %%mm3, %%mm6 \n\t"
+ /* expand +1/-1 into 0x00000000/0xFFFFFFFF dword masks, then
+ * conditionally negate via x - 2x = -x */
+ "psraw $1, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "punpcklwd %%mm1,%%mm1 \n\t"
+ "punpckhwd %%mm2,%%mm2 \n\t"
+ "movq %%mm5, %%mm0 \n\t"
+ "movq %%mm6, %%mm3 \n\t"
+ "paddd %%mm0, %%mm0 \n\t"
+ "paddd %%mm3, %%mm3 \n\t"
+ "pand %%mm1, %%mm0 \n\t"
+ "pand %%mm2, %%mm3 \n\t"
+ "psubd %%mm0, %%mm5 \n\t"
+ "psubd %%mm3, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm5 \n\t"
+ :: "m"(oneconst),"r"(f),"r"(qbits));
+ asm volatile(
+ "movq %%mm5, (%0) \n\t"
+ : "+r"(result)
+ :
+ : "memory");
+
+}
+
+/**
+ * Quantize four 16-bit coefficients:
+ *   result[i] = sign(outblock[i]) * ((|outblock[i]| * MF[i] + f[0]) >> qbits[0])
+ * f[0] and f[1] must both hold the same 32-bit rounding constant (one per
+ * dword lane); qbits[0] is the right-shift amount. Same code as
+ * ff_h264_transform_quantize2_mmx except without the magnitude >>1.
+ * NOTE(review): MMX register contents are carried across separate asm
+ * statements, matching the style used throughout this file.
+ */
+void ff_h264_transform_quantize_mmx(const int16_t *outblock, const int16_t *MF, int16_t *result, const int32_t *f, const int32_t qbits[1])
+{
+ static const int64_t oneconst = 0x0001000100010001LL; /* four words of 1 ("static" first; trailing storage-class is obsolescent in C99) */
+
+ asm volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq (%1), %%mm4 \n\t"
+ :: "r"(outblock), "r"(MF));
+ asm volatile(
+ /* derive mm1 = per-word +1/-1 sign of outblock[0..3] from oneconst (%0) */
+ "movq %%mm0, %%mm7 \n\t"
+ "psraw $14, %%mm7 \n\t"
+ "movq %0, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "psllw $1, %%mm2 \n\t"
+ "pand %%mm2, %%mm7 \n\t"
+ "psubw %%mm7, %%mm1 \n\t" // mm1 now contains the signs of outblock[0..3]
+ "movq %%mm0, %%mm7 \n\t"
+ "pmullw %%mm1, %%mm7 \n\t" // mm7 now contains ABS(outblock)
+ "movq %%mm7, %%mm6 \n\t"
+ "pmullw %%mm4, %%mm6 \n\t"
+ "pmulhw %%mm4, %%mm7 \n\t" // mm6 now contains low words of ABS(outblock)*MF, mm7 contains the high words
+ "movq %%mm6, %%mm5 \n\t" // now mm5 also contains low words
+ "punpcklwd %%mm7, %%mm5 \n\t" // mm5 should now contain the lowest 2 32-bit numbers from ABS(outblock)*MF
+ "punpckhwd %%mm7, %%mm6 \n\t" // mm6 should now contain the highest 2 32-bit numbers from ABS(outblock)*MF
+ "movq (%1), %%mm3 \n\t"
+ "paddd %%mm3, %%mm5 \n\t" // they now contain ABS(outblock)*MF+f
+ "paddd %%mm3, %%mm6 \n\t"
+ "movd (%2), %%mm3 \n\t"
+ "psrld %%mm3, %%mm5 \n\t" // mm5 and mm6 now contain (ABS(outblock)*MF+f) >> qbits
+ "psrld %%mm3, %%mm6 \n\t"
+ /* expand +1/-1 into 0x00000000/0xFFFFFFFF dword masks, then
+ * conditionally negate via x - 2x = -x */
+ "psraw $1, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "punpcklwd %%mm1,%%mm1 \n\t"
+ "punpckhwd %%mm2,%%mm2 \n\t"
+ "movq %%mm5, %%mm0 \n\t"
+ "movq %%mm6, %%mm3 \n\t"
+ "paddd %%mm0, %%mm0 \n\t"
+ "paddd %%mm3, %%mm3 \n\t"
+ "pand %%mm1, %%mm0 \n\t"
+ "pand %%mm2, %%mm3 \n\t"
+ "psubd %%mm0, %%mm5 \n\t"
+ "psubd %%mm3, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm5 \n\t"
+ :: "m"(oneconst),"r"(f),"r"(qbits));
+ asm volatile(
+ "movq %%mm5, (%0) \n\t"
+ : "+r"(result)
+ :
+ : "memory");
+
+}
+
+extern const uint8_t div6[52];
+extern const uint8_t rem6[52];
+
+// we'll always work with transposed input blocks, to avoid having to make a distinction between
+// C and mmx implementations
+/**
+ * Forward 4x4 transform + quantization of a residual block, in place.
+ * MF[QP%6] holds per-position quantization multipliers; each of its rows
+ * covers one row of the (transposed) 4x4 block. qbits = 15 + QP/6 and
+ * f = (1<<qbits)/3 form the shift/rounding pair passed to the row
+ * quantizer. If dontscaleDC is set, the raw transform DC is written back
+ * over the quantized value, leaving the DC unquantized here.
+ */
+void ff_h264_transform_dct_quant_mmx(int16_t block[4][4], int QP, int dontscaleDC) // y,x indexing
+{
+ static const int16_t MF[6][4][4] =
+ {
+ { { 13107, 8066, 13107, 8066}, { 8066, 5243, 8066, 5243}, { 13107, 8066, 13107, 8066}, { 8066, 5243, 8066, 5243} },
+ { { 11916, 7490, 11916, 7490}, { 7490, 4660, 7490, 4660}, { 11916, 7490, 11916, 7490}, { 7490, 4660, 7490, 4660} },
+ { { 10082, 6554, 10082, 6554}, { 6554, 4194, 6554, 4194}, { 10082, 6554, 10082, 6554}, { 6554, 4194, 6554, 4194} },
+ { { 9362, 5825, 9362, 5825}, { 5825, 3647, 5825, 3647}, { 9362, 5825, 9362, 5825}, { 5825, 3647, 5825, 3647} },
+ { { 8192, 5243, 8192, 5243}, { 5243, 3355, 5243, 3355}, { 8192, 5243, 8192, 5243}, { 5243, 3355, 5243, 3355} },
+ { { 7282, 4559, 7282, 4559}, { 4559, 2893, 4559, 2893}, { 7282, 4559, 7282, 4559}, { 4559, 2893, 4559, 2893} }
+ };
+ int32_t qbits = 15 + div6[QP];
+ int32_t f = (1<<qbits)/3;
+ int mod = rem6[QP];
+ DCTELEM outblock[4][4];
+
+ ff_h264_dct_mmx(block, outblock);
+
+ {
+ int32_t qbits2[1];
+ int32_t f2[2];
+
+ /* quantizer expects the rounding constant in both dword lanes */
+ qbits2[0] = qbits;
+ f2[0] = f;
+ f2[1] = f;
+ ff_h264_transform_quantize_mmx(&(outblock[0][0]),&(MF[mod][0][0]),&(block[0][0]),f2,qbits2);
+ ff_h264_transform_quantize_mmx(&(outblock[1][0]),&(MF[mod][1][0]),&(block[1][0]),f2,qbits2);
+ ff_h264_transform_quantize_mmx(&(outblock[2][0]),&(MF[mod][2][0]),&(block[2][0]),f2,qbits2);
+ ff_h264_transform_quantize_mmx(&(outblock[3][0]),&(MF[mod][3][0]),&(block[3][0]),f2,qbits2);
+
+ /* restore the raw transform DC over the quantized value */
+ if (dontscaleDC)
+ block[0][0] = outblock[0][0];
+ }
+}
+
+/**
+ * Inverse-quantize four coefficients for the large-QP regime:
+ *   outblock[i] = saturate16((inblock[i] * V[i]) << shift[0])
+ * (packssdw saturates the 32-bit products to signed 16 bits).
+ * NOTE(review): MMX register state crosses separate asm statements, as
+ * elsewhere in this file.
+ */
+void ff_h264_transform_inverse_quantize_highQP_mmx(const int16_t inblock[4], const int16_t V[4], int16_t outblock[4], const int32_t shift[1])
+{
+ asm volatile(
+ "movq (%0), %%mm7 \n\t"
+ "movq (%1), %%mm4 \n\t"
+ :: "r"(inblock), "r"(V));
+ asm volatile(
+ "movq %%mm7, %%mm6 \n\t"
+ "pmullw %%mm4, %%mm6 \n\t"
+ "pmulhw %%mm4, %%mm7 \n\t" // mm6 now contains low words of inblock*V, mm7 contains the high words
+ "movq %%mm6, %%mm5 \n\t" // now mm5 also contains low words
+ "punpcklwd %%mm7, %%mm5 \n\t" // mm5 should now contain the lowest 2 32-bit numbers from inblock*V
+ "punpckhwd %%mm7, %%mm6 \n\t" // mm6 should now contain the highest 2 32-bit numbers from inblock*V
+ "movd (%0), %%mm3 \n\t" /* %0 here is the shift operand */
+ "pslld %%mm3, %%mm5 \n\t" // mm5 and mm6 now contain (inblock*V) << shift
+ "pslld %%mm3, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm5 \n\t"
+ :: "r"(shift));
+ asm volatile(
+ "movq %%mm5, (%0) \n\t"
+ : "+r"(outblock)
+ :
+ : "memory");
+
+}
+
+/**
+ * Inverse-quantize four coefficients for the small-QP regime:
+ *   outblock[i] = saturate16((inblock[i] * V[i] + add[0]) >> shift[0])
+ * using an arithmetic right shift (psrad); add[0] and add[1] must both
+ * hold the rounding constant, one per dword lane.
+ * NOTE(review): MMX register state crosses separate asm statements, as
+ * elsewhere in this file.
+ */
+void ff_h264_transform_inverse_quantize_lowQP_mmx(const int16_t inblock[4], const int16_t V[4], int16_t outblock[4], const int32_t add[2], const int32_t shift[1])
+{
+ asm volatile(
+ "movq (%0), %%mm7 \n\t"
+ "movq (%1), %%mm4 \n\t"
+ :: "r"(inblock), "r"(V));
+ asm volatile(
+ "movq %%mm7, %%mm6 \n\t"
+ "pmullw %%mm4, %%mm6 \n\t"
+ "pmulhw %%mm4, %%mm7 \n\t" // mm6 now contains low words of inblock*V, mm7 contains the high words
+ "movq %%mm6, %%mm5 \n\t" // now mm5 also contains low words
+ "punpcklwd %%mm7, %%mm5 \n\t" // mm5 should now contain the lowest 2 32-bit numbers from inblock*V
+ "punpckhwd %%mm7, %%mm6 \n\t" // mm6 should now contain the highest 2 32-bit numbers from inblock*V
+ "movq (%1), %%mm3 \n\t" /* %1 here is the add operand */
+ "paddd %%mm3, %%mm5 \n\t"
+ "paddd %%mm3, %%mm6 \n\t"
+ "movd (%0), %%mm3 \n\t" /* %0 here is the shift operand */
+ "psrad %%mm3, %%mm5 \n\t" // mm5 and mm6 now contain (inblock*V+add) >> shift
+ "psrad %%mm3, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm5 \n\t"
+ :: "r"(shift),"r"(add));
+ asm volatile(
+ "movq %%mm5, (%0) \n\t"
+ : "+r"(outblock)
+ :
+ : "memory");
+
+}
+
+/**
+ * Inverse quantization + inverse transform + add for one 4x4 block.
+ * The V[QP%6] table entries are 16x the nominal dequant scale factors
+ * (note the *16 in every initializer); the shift computations below
+ * (div6[QP]-4 resp. 4-div6[QP]) compensate for that factor. For QP >= 24
+ * the rescale is a pure left shift, otherwise a rounded right shift.
+ * If dontscaleDC is set, the DC coefficient is passed through unscaled.
+ * The dequantized block is finally added to dst via the C idct-add.
+ */
+void ff_h264_transform_inverse_quant_dct_add_mmx(int16_t block[4][4], int QP, int dontscaleDC, uint8_t *dst, int stride) // y,x indexing
+{
+ static const int16_t V[6][4][4] =
+ {
+ { { 10*16, 13*16, 10*16, 13*16}, { 13*16, 16*16, 13*16, 16*16}, { 10*16, 13*16, 10*16, 13*16}, { 13*16, 16*16, 13*16, 16*16} },
+ { { 11*16, 14*16, 11*16, 14*16}, { 14*16, 18*16, 14*16, 18*16}, { 11*16, 14*16, 11*16, 14*16}, { 14*16, 18*16, 14*16, 18*16} },
+ { { 13*16, 16*16, 13*16, 16*16}, { 16*16, 20*16, 16*16, 20*16}, { 13*16, 16*16, 13*16, 16*16}, { 16*16, 20*16, 16*16, 20*16} },
+ { { 14*16, 18*16, 14*16, 18*16}, { 18*16, 23*16, 18*16, 23*16}, { 14*16, 18*16, 14*16, 18*16}, { 18*16, 23*16, 18*16, 23*16} },
+ { { 16*16, 20*16, 16*16, 20*16}, { 20*16, 25*16, 20*16, 25*16}, { 16*16, 20*16, 16*16, 20*16}, { 20*16, 25*16, 20*16, 25*16} },
+ { { 18*16, 23*16, 18*16, 23*16}, { 23*16, 29*16, 23*16, 29*16}, { 18*16, 23*16, 18*16, 23*16}, { 23*16, 29*16, 23*16, 29*16} }
+ };
+ DCTELEM elem[4][4];
+ int mod = rem6[QP];
+
+ if (QP >= 24)
+ {
+ int shift = div6[QP]-4;
+
+ int32_t shift1[1];
+ shift1[0] = shift;
+ ff_h264_transform_inverse_quantize_highQP_mmx(&(block[0][0]),&(V[mod][0][0]),&(elem[0][0]),shift1);
+ ff_h264_transform_inverse_quantize_highQP_mmx(&(block[1][0]),&(V[mod][1][0]),&(elem[1][0]),shift1);
+ ff_h264_transform_inverse_quantize_highQP_mmx(&(block[2][0]),&(V[mod][2][0]),&(elem[2][0]),shift1);
+ ff_h264_transform_inverse_quantize_highQP_mmx(&(block[3][0]),&(V[mod][3][0]),&(elem[3][0]),shift1);
+
+ /* keep the DC coefficient unscaled */
+ if (dontscaleDC)
+ elem[0][0] = block[0][0];
+ }
+ else
+ {
+ int add = (1<<(3-div6[QP]));
+ int shift = (4-div6[QP]);
+ int32_t shift1[1];
+ int32_t add2[2];
+ shift1[0] = shift;
+ add2[0] = add;
+ add2[1] = add;
+
+ ff_h264_transform_inverse_quantize_lowQP_mmx(&(block[0][0]),&(V[mod][0][0]),&(elem[0][0]),add2,shift1);
+ ff_h264_transform_inverse_quantize_lowQP_mmx(&(block[1][0]),&(V[mod][1][0]),&(elem[1][0]),add2,shift1);
+ ff_h264_transform_inverse_quantize_lowQP_mmx(&(block[2][0]),&(V[mod][2][0]),&(elem[2][0]),add2,shift1);
+ ff_h264_transform_inverse_quantize_lowQP_mmx(&(block[3][0]),&(V[mod][3][0]),&(elem[3][0]),add2,shift1);
+ /* keep the DC coefficient unscaled */
+ if (dontscaleDC)
+ elem[0][0] = block[0][0];
+ }
+
+ ff_h264_idct_add_c(dst, &(elem[0][0]), stride);
+}
+
+/**
+ * Quantize the 4x4 DC (Hadamard) block in place:
+ *   |ZD(i,j)| = (|YD(i,j)| MF(0,0) + 2 f) >> (qbits + 1)
+ * The magnitude halving implied by the DC transform is performed inside
+ * ff_h264_transform_quantize2_mmx.
+ */
+void ff_h264_hadamard_quant_4x4_mmx(DCTELEM Y[4][4], int QP)
+{
+    const int bits = 15 + div6[QP];
+    const int16_t scale = ff_h264_MF00[rem6[QP]];
+    int16_t scale4[4] = { scale, scale, scale, scale };
+    int32_t round2[2];
+    int32_t shift1[1];
+    int row;
+
+    /* rounding constant 2f in both dword lanes, shift amount qbits+1 */
+    round2[0] = round2[1] = ((1 << bits) / 3) * 2;
+    shift1[0] = bits + 1;
+
+    /* one quantization pass per 4-coefficient row */
+    for (row = 0; row < 4; row++)
+        ff_h264_transform_quantize2_mmx(&Y[row][0], scale4, &Y[row][0], round2, shift1);
+}
+
+/*
+ * Inverse-quantize the 4x4 DC (Hadamard) block in place.
+ * Only valid if qpprime_y_zero_transform_bypass_flag == 0.
+ */
+void ff_h264_hadamard_invquant_4x4_mmx(DCTELEM Y[4][4], int QP)
+{
+    const int16_t v = ff_h264_V00[rem6[QP]];
+    int16_t v4[4] = { v, v, v, v };
+    int row;
+
+    if (QP < 36)
+    {
+        /* small QP: scale, round, then shift right */
+        const int q = div6[QP];
+        int32_t add2[2];
+        int32_t shift1[1];
+
+        add2[0] = add2[1] = 1 << (5 - q);
+        shift1[0] = 6 - q;
+
+        for (row = 0; row < 4; row++)
+            ff_h264_transform_inverse_quantize_lowQP_mmx(&Y[row][0], v4, &Y[row][0], add2, shift1);
+    }
+    else
+    {
+        /* large QP: scale, then shift left */
+        int32_t shift1[1];
+
+        shift1[0] = div6[QP] - 6;
+
+        for (row = 0; row < 4; row++)
+            ff_h264_transform_inverse_quantize_highQP_mmx(&Y[row][0], v4, &Y[row][0], shift1);
+    }
+}
+
+/**
+ * Quantize the 2x2 chroma DC block in place:
+ *   |ZD(i,j)| = (|YD(i,j)| MF(0,0) + 2 f) >> (qbits + 1)
+ * All four coefficients fit in one MMX quantization pass.
+ */
+void ff_h264_hadamard_quant_2x2_mmx(int16_t Y[2][2], int QP)
+{
+    const int bits = 15 + div6[QP];
+    const int16_t scale = ff_h264_MF00[rem6[QP]];
+    int16_t scale4[4] = { scale, scale, scale, scale };
+    int32_t round2[2];
+    int32_t shift1[1];
+
+    /* rounding constant 2f in both dword lanes, shift amount qbits+1 */
+    round2[0] = round2[1] = ((1 << bits) / 3) * 2;
+    shift1[0] = bits + 1;
+
+    ff_h264_transform_quantize_mmx(&Y[0][0], scale4, &Y[0][0], round2, shift1);
+}
+
#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
More information about the ffmpeg-devel
mailing list