[Ffmpeg-devel] H.264 encoder
Panagiotis Issaris
takis.issaris
Wed Oct 4 02:34:00 CEST 2006
Hi,
Attached to this e-mail are the MMX optimizations for the basic H.264 encoder.
With friendly regards,
Jori & Takis
-------------- next part --------------
diff --git a/libavcodec/i386/dsputil_mmx.c b/libavcodec/i386/dsputil_mmx.c
index 6ec808e..2194442 100644
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -3225,6 +3225,15 @@ #endif //CONFIG_ENCODERS
c->h264_idct8_dc_add=
c->h264_idct8_add= ff_h264_idct8_add_mmx;
+ c->h264_dct = ff_h264_dct_mmx;
+ c->h264_idct_notranspose_add = ff_h264_idct_notranspose_add_mmx;
+ c->h264_hadamard_mult4x4 = ff_h264_hadamard_mult4x4_mmx;
+ c->h264_hadamard_quant_2x2 = ff_h264_hadamard_quant_2x2_mmx;
+ c->h264_hadamard_quant_4x4 = ff_h264_hadamard_quant_4x4_mmx;
+ c->h264_hadamard_invquant_4x4 = ff_h264_hadamard_invquant_4x4_mmx;
+ c->h264_transform_dct_quant = ff_h264_transform_dct_quant_mmx;
+ c->h264_transform_inverse_quant_dct_add = ff_h264_transform_inverse_quant_dct_add_mmx;
+
if (mm_flags & MM_MMXEXT) {
c->prefetch = prefetch_mmx2;
diff --git a/libavcodec/i386/h264dsp_mmx.c b/libavcodec/i386/h264dsp_mmx.c
index 83ee362..ca2b5cc 100644
--- a/libavcodec/i386/h264dsp_mmx.c
+++ b/libavcodec/i386/h264dsp_mmx.c
@@ -46,6 +46,11 @@ #define IDCT4_1D( s02, s13, d02, d13, t
SUMSUBD2_AB( s13, d13, t )\
SUMSUB_BADC( d13, s02, s13, d02 )
+/* SBUTTERFLY: interleave step used to build TRANSPOSE4.
+ * Interleaves MMX registers a and b at granularity n ("wd" = 16-bit words):
+ * a receives the low-half interleave, t the high-half interleave; b is
+ * left unchanged. */
+#define SBUTTERFLY(a,b,t,n)\
+ "movq " #a ", " #t " \n\t" /* abcd */\
+ "punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
+ "punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
+
#define TRANSPOSE4(a,b,c,d,t)\
SBUTTERFLY(a,b,t,wd) /* a=aebf t=cgdh */\
SBUTTERFLY(c,d,b,wd) /* c=imjn b=kolp */\
@@ -99,6 +104,131 @@ static void ff_h264_idct_add_mmx(uint8_t
);
}
+/* 4x4 IDCT + add, variant of ff_h264_idct_add_mmx for coefficient blocks
+ * stored without the final transpose: the block is transposed on load, then
+ * the usual two IDCT4_1D passes (with a transpose in between) are applied
+ * and the result is added to dst row by row via STORE_DIFF_4P.
+ * ff_pw_32 is added before the second pass -- presumably the rounding bias
+ * for the final shift inside STORE_DIFF_4P (macro not visible here).
+ * NOTE(review): MMX register contents are carried across three separate
+ * asm statements, which gcc does not formally guarantee; the surrounding
+ * file uses the same pattern. */
+static void ff_h264_idct_notranspose_add_mmx(uint8_t *dst, int16_t *block, int stride)
+{
+ /* Load dct coeffs */
+ asm volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm4 \n\t"
+ "movq 16(%0), %%mm3 \n\t"
+ "movq 24(%0), %%mm1 \n\t"
+ TRANSPOSE4(%%mm0, %%mm4, %%mm3, %%mm1, %%mm2)
+ :: "r"(block) );
+
+ asm volatile(
+ /* mm1=s02+s13 mm2=s02-s13 mm4=d02+d13 mm0=d02-d13 */
+ IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
+
+ "movq %0, %%mm6 \n\t"
+ /* in: 1,4,0,2 out: 1,2,3,0 */
+ TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
+
+ "paddw %%mm6, %%mm3 \n\t"
+
+ /* mm2=s02+s13 mm3=s02-s13 mm4=d02+d13 mm1=d02-d13 */
+ IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
+
+ "pxor %%mm7, %%mm7 \n\t"
+ :: "m"(ff_pw_32));
+
+ /* add the four reconstructed rows (mm0..mm4) to dst */
+ asm volatile(
+ STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
+ "add %1, %0 \n\t"
+ STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
+ : "+r"(dst)
+ : "r" ((long)stride)
+ );
+}
+
+/* One 1-D pass of the H.264 forward 4x4 core transform on four packed-word
+ * registers a,b,c,d (one sample per column). Outputs:
+ *   u = a + b + c + d
+ *   v = 2*(a - d) + (b - c)
+ *   w = a - b - c + d
+ *   x = (a - d) - 2*(b - c)
+ * a and b are clobbered (used as scratch for the doubled terms). */
+#define FORWARD_DCT_PART(a,b,c,d,u,v,w,x) \
+ "movq " #a ", " #u " \n\t" \
+ "paddw " #b ", " #u " \n\t" \
+ "paddw " #c ", " #u " \n\t" \
+ "paddw " #d ", " #u " \n\t" \
+ "movq " #a ", " #w " \n\t" \
+ "psubw " #b ", " #w " \n\t" \
+ "psubw " #c ", " #w " \n\t" \
+ "paddw " #d ", " #w " \n\t" \
+ "movq " #a ", " #x " \n\t" \
+ "psubw " #d ", " #x " \n\t" \
+ "movq " #b ", " #v " \n\t" \
+ "psubw " #c ", " #v " \n\t" \
+ "movq " #v ", " #a " \n\t" \
+ "movq " #x ", " #b " \n\t" \
+ "psllw $1, " #a " \n\t" \
+ "psllw $1, " #b " \n\t" \
+ "paddw " #b ", " #v " \n\t" \
+ "psubw " #a ", " #x " \n\t"
+
+/* Forward 4x4 H.264 core transform: two FORWARD_DCT_PART passes with a
+ * TRANSPOSE4 in between. Input is presumably kept in transposed layout
+ * (see the comment above ff_h264_transform_dct_quant_mmx). The final
+ * coefficients end up in mm1/mm2/mm3/mm5 and are written to outblock;
+ * inblock is not modified.
+ * NOTE(review): registers are carried across separate asm statements,
+ * matching the style of the rest of this file. */
+static void ff_h264_dct_mmx(int16_t inblock[4][4], int16_t outblock[4][4])
+{
+ /* Load the four input rows */
+ asm volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq 8(%0), %%mm1 \n\t"
+ "movq 16(%0), %%mm2 \n\t"
+ "movq 24(%0), %%mm3 \n\t"
+ :: "r"(inblock) );
+
+ asm volatile(
+ FORWARD_DCT_PART( %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm6, %%mm7)
+ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
+ FORWARD_DCT_PART( %%mm4, %%mm7, %%mm0, %%mm6, %%mm1, %%mm2, %%mm3, %%mm5)
+ ::);
+ asm volatile(
+ "movq %%mm1, (%0) \n\t"
+ "movq %%mm2, 8(%0) \n\t"
+ "movq %%mm3,16(%0) \n\t"
+ "movq %%mm5,24(%0) \n\t"
+ : "+r"(outblock)
+ : );
+}
+
+/* One 1-D pass of a 4-point Hadamard transform on packed-word registers
+ * a,b,c,d. Outputs:
+ *   u = a + b + c + d
+ *   v = a + b - c - d
+ *   x = a - b + c - d
+ *   w = a - b - c + d
+ * a and c are clobbered (overwritten with a-b and c-d respectively). */
+#define HADAMARD_MULT_PART(a,b,c,d,u,v,w,x) \
+ "movq " #a ", " #u " \n\t"\
+ "paddw " #b ", " #u " \n\t"\
+ "movq " #u ", " #v " \n\t"\
+ "movq " #c ", " #w " \n\t"\
+ "paddw " #d ", " #w " \n\t"\
+ "paddw " #w ", " #u " \n\t"\
+ "psubw " #w ", " #v " \n\t"\
+ "psubw " #b ", " #a " \n\t"\
+ "movq " #a ", " #w " \n\t"\
+ "movq " #a ", " #x " \n\t"\
+ "psubw " #d ", " #c " \n\t"\
+ "paddw " #c ", " #x " \n\t"\
+ "psubw " #c ", " #w " \n\t"
+
+/* In-place 4x4 Hadamard transform of Y: two HADAMARD_MULT_PART passes,
+ * each preceded by a TRANSPOSE4 so both dimensions are covered.
+ * NOTE(review): register state crosses separate asm statements, as
+ * elsewhere in this file. */
+static void ff_h264_hadamard_mult4x4_mmx(int16_t Y[4][4])
+{
+ /* Load the four rows of Y */
+ asm volatile(
+ "movq (%0), %%mm4 \n\t"
+ "movq 8(%0), %%mm5 \n\t"
+ "movq 16(%0), %%mm6 \n\t"
+ "movq 24(%0), %%mm7 \n\t"
+ :: "r"(Y) );
+
+ asm volatile(
+ TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
+ HADAMARD_MULT_PART( %%mm4, %%mm7, %%mm0, %%mm6, %%mm1, %%mm2, %%mm3, %%mm5)
+ TRANSPOSE4(%%mm1, %%mm2, %%mm3, %%mm5, %%mm0)
+ HADAMARD_MULT_PART( %%mm1, %%mm5, %%mm0, %%mm3, %%mm2, %%mm4, %%mm6, %%mm7)
+ :: );
+ asm volatile(
+ "movq %%mm2, (%0) \n\t"
+ "movq %%mm4, 8(%0) \n\t"
+ "movq %%mm6,16(%0) \n\t"
+ "movq %%mm7,24(%0) \n\t"
+ : "+r"(Y)
+ : );
+}
+
+
static inline void h264_idct8_1d(int16_t *block)
{
asm volatile(
@@ -1349,6 +1479,7 @@ static void OPNAME ## h264_qpel ## SIZE
}\
+/* Plain "mov" store: instantiates the put_ variants of the templated
+ * pixel functions (counterpart of the averaging AVG_*_OP stores below). */
+#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
@@ -1503,6 +1634,343 @@ static inline void ff_h264_biweight_WxH_
}
}
+extern const int16_t ff_h264_MF00[6];
+extern const int16_t ff_h264_V00[6];
+
+// hadamard quantization
+/**
+ * Quantize four 16-bit coefficients with an extra magnitude halving:
+ *   result[i] = sign(outblock[i]) * (((|outblock[i]| >> 1) * MF[i] + f[0]) >> qbits[0])
+ * f[0] and f[1] must both hold the same 32-bit rounding constant (one per
+ * dword lane); qbits[0] is the right-shift amount. The >>1 folds the
+ * pending /2 of the Hadamard DC transform into the quantization step.
+ * Identical to ff_h264_transform_quantize_mmx apart from that >>1.
+ * NOTE(review): MMX register contents are carried across separate asm
+ * statements, matching the style used throughout this file.
+ */
+void ff_h264_transform_quantize2_mmx(const int16_t *outblock, const int16_t *MF, int16_t *result, const int32_t *f, const int32_t qbits[1])
+{
+ static const int64_t oneconst = 0x0001000100010001LL; /* four words of 1 ("static" first; trailing storage-class is obsolescent in C99) */
+
+ asm volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq (%1), %%mm4 \n\t"
+ :: "r"(outblock), "r"(MF));
+ asm volatile(
+ /* derive mm1 = per-word +1/-1 sign of outblock[0..3] from oneconst (%0) */
+ "movq %%mm0, %%mm7 \n\t"
+ "psraw $14, %%mm7 \n\t"
+ "movq %0, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "psllw $1, %%mm2 \n\t"
+ "pand %%mm2, %%mm7 \n\t"
+ "psubw %%mm7, %%mm1 \n\t" // mm1 now contains the signs of outblock[0..3]
+ "movq %%mm0, %%mm7 \n\t"
+ "pmullw %%mm1, %%mm7 \n\t" // mm7 now contains ABS(outblock)
+ "psraw $1, %%mm7 \n\t" // mm7 now contains ABS(outblock) >> 1
+ "movq %%mm7, %%mm6 \n\t"
+ "pmullw %%mm4, %%mm6 \n\t"
+ "pmulhw %%mm4, %%mm7 \n\t" // mm6 now contains low words of (ABS(outblock)>>1)*MF, mm7 contains the high words
+ "movq %%mm6, %%mm5 \n\t" // now mm5 also contains low words
+ "punpcklwd %%mm7, %%mm5 \n\t" // mm5 should now contain the lowest 2 32-bit numbers from (ABS(outblock)>>1)*MF
+ "punpckhwd %%mm7, %%mm6 \n\t" // mm6 should now contain the highest 2 32-bit numbers from (ABS(outblock)>>1)*MF
+ "movq (%1), %%mm3 \n\t"
+ "paddd %%mm3, %%mm5 \n\t" // they now contain (ABS(outblock)>>1)*MF+f
+ "paddd %%mm3, %%mm6 \n\t"
+ "movd (%2), %%mm3 \n\t"
+ "psrld %%mm3, %%mm5 \n\t" // mm5 and mm6 now contain ((ABS(outblock)>>1)*MF+f) >> qbits
+ "psrld %%mm3, %%mm6 \n\t"
+ /* expand +1/-1 into 0x00000000/0xFFFFFFFF dword masks, then
+ * conditionally negate via x - 2x = -x */
+ "psraw $1, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "punpcklwd %%mm1,%%mm1 \n\t"
+ "punpckhwd %%mm2,%%mm2 \n\t"
+ "movq %%mm5, %%mm0 \n\t"
+ "movq %%mm6, %%mm3 \n\t"
+ "paddd %%mm0, %%mm0 \n\t"
+ "paddd %%mm3, %%mm3 \n\t"
+ "pand %%mm1, %%mm0 \n\t"
+ "pand %%mm2, %%mm3 \n\t"
+ "psubd %%mm0, %%mm5 \n\t"
+ "psubd %%mm3, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm5 \n\t"
+ :: "m"(oneconst),"r"(f),"r"(qbits));
+ asm volatile(
+ "movq %%mm5, (%0) \n\t"
+ : "+r"(result)
+ :
+ : "memory");
+
+}
+
+/**
+ * Quantize four 16-bit coefficients:
+ *   result[i] = sign(outblock[i]) * ((|outblock[i]| * MF[i] + f[0]) >> qbits[0])
+ * f[0] and f[1] must both hold the same 32-bit rounding constant (one per
+ * dword lane); qbits[0] is the right-shift amount. Same code as
+ * ff_h264_transform_quantize2_mmx except without the magnitude >>1.
+ * NOTE(review): MMX register contents are carried across separate asm
+ * statements, matching the style used throughout this file.
+ */
+void ff_h264_transform_quantize_mmx(const int16_t *outblock, const int16_t *MF, int16_t *result, const int32_t *f, const int32_t qbits[1])
+{
+ static const int64_t oneconst = 0x0001000100010001LL; /* four words of 1 ("static" first; trailing storage-class is obsolescent in C99) */
+
+ asm volatile(
+ "movq (%0), %%mm0 \n\t"
+ "movq (%1), %%mm4 \n\t"
+ :: "r"(outblock), "r"(MF));
+ asm volatile(
+ /* derive mm1 = per-word +1/-1 sign of outblock[0..3] from oneconst (%0) */
+ "movq %%mm0, %%mm7 \n\t"
+ "psraw $14, %%mm7 \n\t"
+ "movq %0, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "psllw $1, %%mm2 \n\t"
+ "pand %%mm2, %%mm7 \n\t"
+ "psubw %%mm7, %%mm1 \n\t" // mm1 now contains the signs of outblock[0..3]
+ "movq %%mm0, %%mm7 \n\t"
+ "pmullw %%mm1, %%mm7 \n\t" // mm7 now contains ABS(outblock)
+ "movq %%mm7, %%mm6 \n\t"
+ "pmullw %%mm4, %%mm6 \n\t"
+ "pmulhw %%mm4, %%mm7 \n\t" // mm6 now contains low words of ABS(outblock)*MF, mm7 contains the high words
+ "movq %%mm6, %%mm5 \n\t" // now mm5 also contains low words
+ "punpcklwd %%mm7, %%mm5 \n\t" // mm5 should now contain the lowest 2 32-bit numbers from ABS(outblock)*MF
+ "punpckhwd %%mm7, %%mm6 \n\t" // mm6 should now contain the highest 2 32-bit numbers from ABS(outblock)*MF
+ "movq (%1), %%mm3 \n\t"
+ "paddd %%mm3, %%mm5 \n\t" // they now contain ABS(outblock)*MF+f
+ "paddd %%mm3, %%mm6 \n\t"
+ "movd (%2), %%mm3 \n\t"
+ "psrld %%mm3, %%mm5 \n\t" // mm5 and mm6 now contain (ABS(outblock)*MF+f) >> qbits
+ "psrld %%mm3, %%mm6 \n\t"
+ /* expand +1/-1 into 0x00000000/0xFFFFFFFF dword masks, then
+ * conditionally negate via x - 2x = -x */
+ "psraw $1, %%mm1 \n\t"
+ "movq %%mm1, %%mm2 \n\t"
+ "punpcklwd %%mm1,%%mm1 \n\t"
+ "punpckhwd %%mm2,%%mm2 \n\t"
+ "movq %%mm5, %%mm0 \n\t"
+ "movq %%mm6, %%mm3 \n\t"
+ "paddd %%mm0, %%mm0 \n\t"
+ "paddd %%mm3, %%mm3 \n\t"
+ "pand %%mm1, %%mm0 \n\t"
+ "pand %%mm2, %%mm3 \n\t"
+ "psubd %%mm0, %%mm5 \n\t"
+ "psubd %%mm3, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm5 \n\t"
+ :: "m"(oneconst),"r"(f),"r"(qbits));
+ asm volatile(
+ "movq %%mm5, (%0) \n\t"
+ : "+r"(result)
+ :
+ : "memory");
+
+}
+
+extern const uint8_t div6[52];
+extern const uint8_t rem6[52];
+
+// we'll always work with transposed input blocks, to avoid having to make a distinction between
+// C and mmx implementations
+/**
+ * Forward 4x4 transform + quantization of a residual block, in place.
+ * MF[QP%6] holds per-position quantization multipliers; each of its rows
+ * covers one row of the (transposed) 4x4 block. qbits = 15 + QP/6 and
+ * f = (1<<qbits)/3 form the shift/rounding pair passed to the row
+ * quantizer. If dontscaleDC is set, the raw transform DC is written back
+ * over the quantized value, leaving the DC unquantized here.
+ */
+void ff_h264_transform_dct_quant_mmx(int16_t block[4][4], int QP, int dontscaleDC) // y,x indexing
+{
+ static const int16_t MF[6][4][4] =
+ {
+ { { 13107, 8066, 13107, 8066}, { 8066, 5243, 8066, 5243}, { 13107, 8066, 13107, 8066}, { 8066, 5243, 8066, 5243} },
+ { { 11916, 7490, 11916, 7490}, { 7490, 4660, 7490, 4660}, { 11916, 7490, 11916, 7490}, { 7490, 4660, 7490, 4660} },
+ { { 10082, 6554, 10082, 6554}, { 6554, 4194, 6554, 4194}, { 10082, 6554, 10082, 6554}, { 6554, 4194, 6554, 4194} },
+ { { 9362, 5825, 9362, 5825}, { 5825, 3647, 5825, 3647}, { 9362, 5825, 9362, 5825}, { 5825, 3647, 5825, 3647} },
+ { { 8192, 5243, 8192, 5243}, { 5243, 3355, 5243, 3355}, { 8192, 5243, 8192, 5243}, { 5243, 3355, 5243, 3355} },
+ { { 7282, 4559, 7282, 4559}, { 4559, 2893, 4559, 2893}, { 7282, 4559, 7282, 4559}, { 4559, 2893, 4559, 2893} }
+ };
+ int32_t qbits = 15 + div6[QP];
+ int32_t f = (1<<qbits)/3;
+ int mod = rem6[QP];
+ DCTELEM outblock[4][4];
+
+ ff_h264_dct_mmx(block, outblock);
+
+ {
+ int32_t qbits2[1];
+ int32_t f2[2];
+
+ /* quantizer expects the rounding constant in both dword lanes */
+ qbits2[0] = qbits;
+ f2[0] = f;
+ f2[1] = f;
+ ff_h264_transform_quantize_mmx(&(outblock[0][0]),&(MF[mod][0][0]),&(block[0][0]),f2,qbits2);
+ ff_h264_transform_quantize_mmx(&(outblock[1][0]),&(MF[mod][1][0]),&(block[1][0]),f2,qbits2);
+ ff_h264_transform_quantize_mmx(&(outblock[2][0]),&(MF[mod][2][0]),&(block[2][0]),f2,qbits2);
+ ff_h264_transform_quantize_mmx(&(outblock[3][0]),&(MF[mod][3][0]),&(block[3][0]),f2,qbits2);
+
+ /* restore the raw transform DC over the quantized value */
+ if (dontscaleDC)
+ block[0][0] = outblock[0][0];
+ }
+}
+
+/**
+ * Inverse-quantize four coefficients for the large-QP regime:
+ *   outblock[i] = saturate16((inblock[i] * V[i]) << shift[0])
+ * (packssdw saturates the 32-bit products to signed 16 bits).
+ * NOTE(review): MMX register state crosses separate asm statements, as
+ * elsewhere in this file.
+ */
+void ff_h264_transform_inverse_quantize_highQP_mmx(const int16_t inblock[4], const int16_t V[4], int16_t outblock[4], const int32_t shift[1])
+{
+ asm volatile(
+ "movq (%0), %%mm7 \n\t"
+ "movq (%1), %%mm4 \n\t"
+ :: "r"(inblock), "r"(V));
+ asm volatile(
+ "movq %%mm7, %%mm6 \n\t"
+ "pmullw %%mm4, %%mm6 \n\t"
+ "pmulhw %%mm4, %%mm7 \n\t" // mm6 now contains low words of inblock*V, mm7 contains the high words
+ "movq %%mm6, %%mm5 \n\t" // now mm5 also contains low words
+ "punpcklwd %%mm7, %%mm5 \n\t" // mm5 should now contain the lowest 2 32-bit numbers from inblock*V
+ "punpckhwd %%mm7, %%mm6 \n\t" // mm6 should now contain the highest 2 32-bit numbers from inblock*V
+ "movd (%0), %%mm3 \n\t" /* %0 here is the shift operand */
+ "pslld %%mm3, %%mm5 \n\t" // mm5 and mm6 now contain (inblock*V) << shift
+ "pslld %%mm3, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm5 \n\t"
+ :: "r"(shift));
+ asm volatile(
+ "movq %%mm5, (%0) \n\t"
+ : "+r"(outblock)
+ :
+ : "memory");
+
+}
+
+/**
+ * Inverse-quantize four coefficients for the small-QP regime:
+ *   outblock[i] = saturate16((inblock[i] * V[i] + add[0]) >> shift[0])
+ * using an arithmetic right shift (psrad); add[0] and add[1] must both
+ * hold the rounding constant, one per dword lane.
+ * NOTE(review): MMX register state crosses separate asm statements, as
+ * elsewhere in this file.
+ */
+void ff_h264_transform_inverse_quantize_lowQP_mmx(const int16_t inblock[4], const int16_t V[4], int16_t outblock[4], const int32_t add[2], const int32_t shift[1])
+{
+ asm volatile(
+ "movq (%0), %%mm7 \n\t"
+ "movq (%1), %%mm4 \n\t"
+ :: "r"(inblock), "r"(V));
+ asm volatile(
+ "movq %%mm7, %%mm6 \n\t"
+ "pmullw %%mm4, %%mm6 \n\t"
+ "pmulhw %%mm4, %%mm7 \n\t" // mm6 now contains low words of inblock*V, mm7 contains the high words
+ "movq %%mm6, %%mm5 \n\t" // now mm5 also contains low words
+ "punpcklwd %%mm7, %%mm5 \n\t" // mm5 should now contain the lowest 2 32-bit numbers from inblock*V
+ "punpckhwd %%mm7, %%mm6 \n\t" // mm6 should now contain the highest 2 32-bit numbers from inblock*V
+ "movq (%1), %%mm3 \n\t" /* %1 here is the add operand */
+ "paddd %%mm3, %%mm5 \n\t"
+ "paddd %%mm3, %%mm6 \n\t"
+ "movd (%0), %%mm3 \n\t" /* %0 here is the shift operand */
+ "psrad %%mm3, %%mm5 \n\t" // mm5 and mm6 now contain (inblock*V+add) >> shift
+ "psrad %%mm3, %%mm6 \n\t"
+ "packssdw %%mm6, %%mm5 \n\t"
+ :: "r"(shift),"r"(add));
+ asm volatile(
+ "movq %%mm5, (%0) \n\t"
+ : "+r"(outblock)
+ :
+ : "memory");
+
+}
+
+/**
+ * Inverse quantization + inverse transform + add for one 4x4 block.
+ * The V[QP%6] table entries are 16x the nominal dequant scale factors
+ * (note the *16 in every initializer); the shift computations below
+ * (div6[QP]-4 resp. 4-div6[QP]) compensate for that factor. For QP >= 24
+ * the rescale is a pure left shift, otherwise a rounded right shift.
+ * If dontscaleDC is set, the DC coefficient is passed through unscaled.
+ * The dequantized block is finally added to dst via the C idct-add.
+ */
+void ff_h264_transform_inverse_quant_dct_add_mmx(int16_t block[4][4], int QP, int dontscaleDC, uint8_t *dst, int stride) // y,x indexing
+{
+ static const int16_t V[6][4][4] =
+ {
+ { { 10*16, 13*16, 10*16, 13*16}, { 13*16, 16*16, 13*16, 16*16}, { 10*16, 13*16, 10*16, 13*16}, { 13*16, 16*16, 13*16, 16*16} },
+ { { 11*16, 14*16, 11*16, 14*16}, { 14*16, 18*16, 14*16, 18*16}, { 11*16, 14*16, 11*16, 14*16}, { 14*16, 18*16, 14*16, 18*16} },
+ { { 13*16, 16*16, 13*16, 16*16}, { 16*16, 20*16, 16*16, 20*16}, { 13*16, 16*16, 13*16, 16*16}, { 16*16, 20*16, 16*16, 20*16} },
+ { { 14*16, 18*16, 14*16, 18*16}, { 18*16, 23*16, 18*16, 23*16}, { 14*16, 18*16, 14*16, 18*16}, { 18*16, 23*16, 18*16, 23*16} },
+ { { 16*16, 20*16, 16*16, 20*16}, { 20*16, 25*16, 20*16, 25*16}, { 16*16, 20*16, 16*16, 20*16}, { 20*16, 25*16, 20*16, 25*16} },
+ { { 18*16, 23*16, 18*16, 23*16}, { 23*16, 29*16, 23*16, 29*16}, { 18*16, 23*16, 18*16, 23*16}, { 23*16, 29*16, 23*16, 29*16} }
+ };
+ DCTELEM elem[4][4];
+ int mod = rem6[QP];
+
+ if (QP >= 24)
+ {
+ int shift = div6[QP]-4;
+
+ int32_t shift1[1];
+ shift1[0] = shift;
+ ff_h264_transform_inverse_quantize_highQP_mmx(&(block[0][0]),&(V[mod][0][0]),&(elem[0][0]),shift1);
+ ff_h264_transform_inverse_quantize_highQP_mmx(&(block[1][0]),&(V[mod][1][0]),&(elem[1][0]),shift1);
+ ff_h264_transform_inverse_quantize_highQP_mmx(&(block[2][0]),&(V[mod][2][0]),&(elem[2][0]),shift1);
+ ff_h264_transform_inverse_quantize_highQP_mmx(&(block[3][0]),&(V[mod][3][0]),&(elem[3][0]),shift1);
+
+ /* keep the DC coefficient unscaled */
+ if (dontscaleDC)
+ elem[0][0] = block[0][0];
+ }
+ else
+ {
+ int add = (1<<(3-div6[QP]));
+ int shift = (4-div6[QP]);
+ int32_t shift1[1];
+ int32_t add2[2];
+ shift1[0] = shift;
+ add2[0] = add;
+ add2[1] = add;
+
+ ff_h264_transform_inverse_quantize_lowQP_mmx(&(block[0][0]),&(V[mod][0][0]),&(elem[0][0]),add2,shift1);
+ ff_h264_transform_inverse_quantize_lowQP_mmx(&(block[1][0]),&(V[mod][1][0]),&(elem[1][0]),add2,shift1);
+ ff_h264_transform_inverse_quantize_lowQP_mmx(&(block[2][0]),&(V[mod][2][0]),&(elem[2][0]),add2,shift1);
+ ff_h264_transform_inverse_quantize_lowQP_mmx(&(block[3][0]),&(V[mod][3][0]),&(elem[3][0]),add2,shift1);
+ /* keep the DC coefficient unscaled */
+ if (dontscaleDC)
+ elem[0][0] = block[0][0];
+ }
+
+ ff_h264_idct_add_c(dst, &(elem[0][0]), stride);
+}
+
+/**
+ * Quantize the 4x4 DC (Hadamard) block in place:
+ *   |ZD(i,j)| = (|YD(i,j)| MF(0,0) + 2 f) >> (qbits + 1)
+ * The magnitude halving implied by the DC transform is performed inside
+ * ff_h264_transform_quantize2_mmx.
+ */
+void ff_h264_hadamard_quant_4x4_mmx(DCTELEM Y[4][4], int QP)
+{
+    const int bits = 15 + div6[QP];
+    const int16_t scale = ff_h264_MF00[rem6[QP]];
+    int16_t scale4[4] = { scale, scale, scale, scale };
+    int32_t round2[2];
+    int32_t shift1[1];
+    int row;
+
+    /* rounding constant 2f in both dword lanes, shift amount qbits+1 */
+    round2[0] = round2[1] = ((1 << bits) / 3) * 2;
+    shift1[0] = bits + 1;
+
+    /* one quantization pass per 4-coefficient row */
+    for (row = 0; row < 4; row++)
+        ff_h264_transform_quantize2_mmx(&Y[row][0], scale4, &Y[row][0], round2, shift1);
+}
+
+/*
+ * Inverse-quantize the 4x4 DC (Hadamard) block in place.
+ * Only valid if qpprime_y_zero_transform_bypass_flag == 0.
+ */
+void ff_h264_hadamard_invquant_4x4_mmx(DCTELEM Y[4][4], int QP)
+{
+    const int16_t v = ff_h264_V00[rem6[QP]];
+    int16_t v4[4] = { v, v, v, v };
+    int row;
+
+    if (QP < 36)
+    {
+        /* small QP: scale, round, then shift right */
+        const int q = div6[QP];
+        int32_t add2[2];
+        int32_t shift1[1];
+
+        add2[0] = add2[1] = 1 << (5 - q);
+        shift1[0] = 6 - q;
+
+        for (row = 0; row < 4; row++)
+            ff_h264_transform_inverse_quantize_lowQP_mmx(&Y[row][0], v4, &Y[row][0], add2, shift1);
+    }
+    else
+    {
+        /* large QP: scale, then shift left */
+        int32_t shift1[1];
+
+        shift1[0] = div6[QP] - 6;
+
+        for (row = 0; row < 4; row++)
+            ff_h264_transform_inverse_quantize_highQP_mmx(&Y[row][0], v4, &Y[row][0], shift1);
+    }
+}
+
+/**
+ * Quantize the 2x2 chroma DC block in place:
+ *   |ZD(i,j)| = (|YD(i,j)| MF(0,0) + 2 f) >> (qbits + 1)
+ * All four coefficients fit in one MMX quantization pass.
+ */
+void ff_h264_hadamard_quant_2x2_mmx(int16_t Y[2][2], int QP)
+{
+    const int bits = 15 + div6[QP];
+    const int16_t scale = ff_h264_MF00[rem6[QP]];
+    int16_t scale4[4] = { scale, scale, scale, scale };
+    int32_t round2[2];
+    int32_t shift1[1];
+
+    /* rounding constant 2f in both dword lanes, shift amount qbits+1 */
+    round2[0] = round2[1] = ((1 << bits) / 3) * 2;
+    shift1[0] = bits + 1;
+
+    ff_h264_transform_quantize_mmx(&Y[0][0], scale4, &Y[0][0], round2, shift1);
+}
+
#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
More information about the ffmpeg-devel
mailing list