diff -urp ffmpeg-old/libavcodec/i386/dsputil_mmx.c ffmpeg/libavcodec/i386/dsputil_mmx.c --- ffmpeg-old/libavcodec/i386/dsputil_mmx.c 2007-01-27 20:35:52.000000000 +0100 +++ ffmpeg/libavcodec/i386/dsputil_mmx.c 2007-01-28 00:38:01.000000000 +0100 @@ -655,15 +655,14 @@ static inline void transpose4x4(uint8_t "punpckhwd %%mm2, %%mm1 \n\t" "movd %%mm0, %0 \n\t" "punpckhdq %%mm0, %%mm0 \n\t" - "movd %%mm0, %1 \n\t" - "movd %%mm1, %2 \n\t" + "movd %%mm0, (%0,%1) \n\t" + "movd %%mm1, (%0,%1,2) \n\t" "punpckhdq %%mm1, %%mm1 \n\t" - "movd %%mm1, %3 \n\t" + "lea (%1,%1,2), %1 \n\t" + "movd %%mm1, (%0,%1) \n\t" - : "=m" (*(uint32_t*)(dst + 0*dst_stride)), - "=m" (*(uint32_t*)(dst + 1*dst_stride)), - "=m" (*(uint32_t*)(dst + 2*dst_stride)), - "=m" (*(uint32_t*)(dst + 3*dst_stride)) + : "=r" (*(uint32_t*)(dst)), "+r" (dst_stride) + :: "memory" ); } @@ -1743,7 +1742,7 @@ WARPER8_16_SQ(hadamard8_diff_mmx2, hadam #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\ "paddw " #m4 ", " #m3 " \n\t" /* x1 */\ - "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\ + "movq %5, %%mm4 \n\t" /* 20 */\ "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\ "movq "#in7", " #m3 " \n\t" /* d */\ "movq "#in0", %%mm5 \n\t" /* D */\ @@ -1755,7 +1754,7 @@ WARPER8_16_SQ(hadamard8_diff_mmx2, hadam "paddw " #m5 ", %%mm6 \n\t" /* x2 */\ "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\ "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\ - "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\ + "pmullw %6, %%mm5 \n\t" /* -6x2 + 3x3 */\ "paddw " #rnd ", %%mm4 \n\t" /* x2 */\ "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\ "psraw $5, %%mm5 \n\t"\ @@ -1789,15 +1788,15 @@ static void OPNAME ## mpeg4_qpel16_h_low "paddw %%mm5, %%mm5 \n\t" /* 2b */\ "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ + "pmullw %6, %%mm6 \n\t" /* 3c - 6b */\ "paddw %%mm4, %%mm0 \n\t" /* a */\ "paddw %%mm1, %%mm5 \n\t" /* d */\ - "pmullw 
"MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ + "pmullw %5, %%mm0 \n\t" /* 20a */\ "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ - "paddw %6, %%mm6 \n\t"\ + "paddw %8, %%mm6 \n\t"\ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ "psraw $5, %%mm0 \n\t"\ - "movq %%mm0, %5 \n\t"\ + "movq %%mm0, %7 \n\t"\ /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ \ "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\ @@ -1815,15 +1814,15 @@ static void OPNAME ## mpeg4_qpel16_h_low "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\ "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\ "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\ - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ + "pmullw %6, %%mm3 \n\t" /* 3c - 6b */\ "paddw %%mm2, %%mm1 \n\t" /* a */\ "paddw %%mm6, %%mm4 \n\t" /* d */\ - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ + "pmullw %5, %%mm1 \n\t" /* 20a */\ "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\ - "paddw %6, %%mm1 \n\t"\ + "paddw %8, %%mm1 \n\t"\ "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\ "psraw $5, %%mm3 \n\t"\ - "movq %5, %%mm1 \n\t"\ + "movq %7, %%mm1 \n\t"\ "packuswb %%mm3, %%mm1 \n\t"\ OP_MMX2(%%mm1, (%1),%%mm4, q)\ /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\ @@ -1841,7 +1840,7 @@ static void OPNAME ## mpeg4_qpel16_h_low "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\ "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\ "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\ - "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\ + "pmullw %6, %%mm0 \n\t" /* 3c - 6b */\ "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\ "paddw %%mm3, %%mm2 \n\t" /* d */\ "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\ @@ -1849,8 +1848,8 @@ static void OPNAME ## mpeg4_qpel16_h_low "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\ "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\ "paddw %%mm2, %%mm6 \n\t" /* a */\ - "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\ - "paddw %6, %%mm0 \n\t"\ + "pmullw %5, %%mm6 \n\t" /* 20a */\ + "paddw %8, %%mm0 \n\t"\ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ "psraw $5, %%mm0 \n\t"\ 
/* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\ @@ -1864,10 +1863,10 @@ static void OPNAME ## mpeg4_qpel16_h_low "paddw %%mm2, %%mm5 \n\t" /* d */\ "paddw %%mm6, %%mm6 \n\t" /* 2b */\ "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\ - "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\ - "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\ + "pmullw %5, %%mm3 \n\t" /* 20a */\ + "pmullw %6, %%mm4 \n\t" /* 3c - 6b */\ "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\ - "paddw %6, %%mm4 \n\t"\ + "paddw %8, %%mm4 \n\t"\ "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\ "psraw $5, %%mm4 \n\t"\ "packuswb %%mm4, %%mm0 \n\t"\ @@ -1877,8 +1876,8 @@ static void OPNAME ## mpeg4_qpel16_h_low "add %4, %1 \n\t"\ "decl %2 \n\t"\ " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+m"(h)\ - : "d"((long)srcStride), "S"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ + : : "a"(src), "c"(dst), "m"(h), \ + "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\ : "memory"\ );\ }\ @@ -1956,12 +1955,12 @@ static void OPNAME ## mpeg4_qpel8_h_lowp "paddw %%mm5, %%mm5 \n\t" /* 2b */\ "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\ "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\ - "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\ + "pmullw %6, %%mm6 \n\t" /* 3c - 6b */\ "paddw %%mm4, %%mm0 \n\t" /* a */\ "paddw %%mm1, %%mm5 \n\t" /* d */\ - "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\ + "pmullw %5, %%mm0 \n\t" /* 20a */\ "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\ - "paddw %6, %%mm6 \n\t"\ + "paddw %8, %%mm6 \n\t"\ "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\ "psraw $5, %%mm0 \n\t"\ /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\ @@ -1977,10 +1976,10 @@ static void OPNAME ## mpeg4_qpel8_h_lowp "paddw %%mm5, %%mm4 \n\t" /* d */\ "paddw %%mm2, %%mm2 \n\t" /* 2b */\ "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\ - "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\ - "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\ + 
"pmullw %5, %%mm1 \n\t" /* 20a */\ + "pmullw %6, %%mm3 \n\t" /* 3c - 6b */\ "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\ - "paddw %6, %%mm1 \n\t"\ + "paddw %8, %%mm1 \n\t"\ "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\ "psraw $5, %%mm3 \n\t"\ "packuswb %%mm3, %%mm0 \n\t"\ @@ -1990,8 +1989,8 @@ static void OPNAME ## mpeg4_qpel8_h_lowp "add %4, %1 \n\t"\ "decl %2 \n\t"\ " jnz 1b \n\t"\ - : "+a"(src), "+c"(dst), "+m"(h)\ - : "S"((long)srcStride), "D"((long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\ + : : "a"(src), "c"(dst), "m"(h), \ + "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(temp), "m"(ROUNDER)\ : "memory"\ );\ }\ @@ -2070,39 +2069,39 @@ static void OPNAME ## mpeg4_qpel16_v_low "movq 8(%0), %%mm1 \n\t"\ "movq 16(%0), %%mm2 \n\t"\ "movq 24(%0), %%mm3 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 16(%0), 24(%0), 32(%0), 
72(%0), (%1, %3), OP)\ "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\ "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\ "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\ "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\ \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\ "add %4, %1 \n\t" \ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\ + 
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\ \ "add $136, %0 \n\t"\ - "add %6, %1 \n\t"\ + "add %8, %1 \n\t"\ "decl %2 \n\t"\ " jnz 1b \n\t"\ \ - : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ - : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(long)dstStride)\ + : : "r"(temp_ptr), "r"(dst), "g"(count), \ + "r"((long)dstStride), "r"(2*(long)dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-14*(long)dstStride)\ :"memory"\ );\ }\ @@ -2142,27 +2141,27 @@ static void OPNAME ## mpeg4_qpel8_v_lowp "movq 8(%0), %%mm1 \n\t"\ "movq 16(%0), %%mm2 \n\t"\ "movq 24(%0), %%mm3 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\ "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, (%0), (%0), 8(%0), 48(%0), (%1), OP)\ \ - QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\ "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ + QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %7, %8, %7, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\ \ - QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %7, %8, %7, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\ "add %4, %1 \n\t"\ - QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ - 
QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ + QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %7, %8, %7, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\ + QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %7, %8, %7, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\ \ "add $72, %0 \n\t"\ - "add %6, %1 \n\t"\ + "add %8, %1 \n\t"\ "decl %2 \n\t"\ " jnz 1b \n\t"\ \ - : "+r"(temp_ptr), "+r"(dst), "+g"(count)\ - : "r"((long)dstStride), "r"(2*(long)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(long)dstStride)\ + : : "r"(temp_ptr), "r"(dst), "rm"(count), \ + "r"((long)dstStride), "r"(2*(long)dstStride), "m"(ff_pw_20), "m"(ff_pw_3), "m"(ROUNDER), "g"(4-6*(long)dstStride)\ : "memory"\ );\ }\ diff -urp ffmpeg-old/libavcodec/i386/h264dsp_mmx.c ffmpeg/libavcodec/i386/h264dsp_mmx.c --- ffmpeg-old/libavcodec/i386/h264dsp_mmx.c 2006-10-17 14:24:19.000000000 +0200 +++ ffmpeg/libavcodec/i386/h264dsp_mmx.c 2007-01-28 01:45:42.000000000 +0100 @@ -600,22 +600,26 @@ static void h264_loop_filter_strength_mm "paddb %%mm6, %%mm1 \n\t" "punpckhbw %%mm7, %%mm1 \n\t" // ref[b] != ref[bn] "por %%mm1, %%mm0 \n\t" + ::"m"(ref[l][b_idx]), + "m"(ref[l][b_idx+d_idx]) + : "memory" + ); - "movq %2, %%mm1 \n\t" - "movq %3, %%mm2 \n\t" - "psubw %4, %%mm1 \n\t" - "psubw %5, %%mm2 \n\t" + asm volatile( + "movq %0, %%mm1 \n\t" + "movq %1, %%mm2 \n\t" + "psubw %2, %%mm1 \n\t" + "psubw %3, %%mm2 \n\t" "packsswb %%mm2, %%mm1 \n\t" "paddb %%mm5, %%mm1 \n\t" "pminub %%mm4, %%mm1 \n\t" "pcmpeqb %%mm4, %%mm1 \n\t" // abs(mv[b] - mv[bn]) >= limit "por %%mm1, %%mm0 \n\t" - ::"m"(ref[l][b_idx]), - "m"(ref[l][b_idx+d_idx]), - "m"(mv[l][b_idx][0]), + ::"m"(mv[l][b_idx][0]), "m"(mv[l][b_idx+2][0]), "m"(mv[l][b_idx+d_idx][0]), "m"(mv[l][b_idx+d_idx+2][0]) + : "memory" ); } } diff -urp ffmpeg-old/libavcodec/i386/motion_est_mmx.c ffmpeg/libavcodec/i386/motion_est_mmx.c --- ffmpeg-old/libavcodec/i386/motion_est_mmx.c 2006-10-17 14:24:19.000000000 +0200 +++ 
ffmpeg/libavcodec/i386/motion_est_mmx.c 2007-01-27 22:43:34.000000000 +0100 @@ -121,7 +121,7 @@ static inline void sad8_4_mmx2(uint8_t * long len= -(stride*h); asm volatile( ASMALIGN(4) - "movq "MANGLE(bone)", %%mm5 \n\t" + "movq %5, %%mm5 \n\t" "1: \n\t" "movq (%1, %%"REG_a"), %%mm0 \n\t" "movq (%2, %%"REG_a"), %%mm2 \n\t" @@ -149,7 +149,7 @@ static inline void sad8_4_mmx2(uint8_t * "add %4, %%"REG_a" \n\t" " js 1b \n\t" : "+a" (len) - : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride) + : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride), "m" (bone) ); } diff -urp ffmpeg-old/libavcodec/i386/mpegvideo_mmx_template.c ffmpeg/libavcodec/i386/mpegvideo_mmx_template.c --- ffmpeg-old/libavcodec/i386/mpegvideo_mmx_template.c 2006-10-17 14:24:19.000000000 +0200 +++ ffmpeg/libavcodec/i386/mpegvideo_mmx_template.c 2007-01-28 01:44:09.000000000 +0100 @@ -110,7 +110,7 @@ static int RENAME(dct_quantize)(MpegEncC SPREADW(%%mm3) "pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm4, %%mm4 \n\t" // 0 - "movq (%2), %%mm5 \n\t" // qmat[0] + "movq %2, %%mm5 \n\t" // qmat[0] "pxor %%mm6, %%mm6 \n\t" "psubw (%3), %%mm6 \n\t" // -bias[0] "mov $-128, %%"REG_a" \n\t" @@ -138,7 +138,7 @@ static int RENAME(dct_quantize)(MpegEncC "movd %%mm3, %%"REG_a" \n\t" "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 : "+a" (last_non_zero_p1) - : "r" (block+64), "r" (qmat), "r" (bias), + : "r" (block+64), "m" (qmat), "r" (bias), "r" (inv_zigzag_direct16+64), "r" (temp_block+64) ); // note the asm is split cuz gcc doesnt like that many operands ... 
@@ -153,15 +153,19 @@ static int RENAME(dct_quantize)(MpegEncC ); }else{ // FMT_H263 asm volatile( - "movd %%"REG_a", %%mm3 \n\t" // last_non_zero_p1 + "movd %0, %%mm3 \n\t" // last_non_zero_p1 SPREADW(%%mm3) "pxor %%mm7, %%mm7 \n\t" // 0 "pxor %%mm4, %%mm4 \n\t" // 0 + "push %%"REG_a" \n\t" "mov $-128, %%"REG_a" \n\t" ASMALIGN(4) "1: \n\t" "pxor %%mm1, %%mm1 \n\t" // 0 - "movq (%1, %%"REG_a"), %%mm0 \n\t" // block[i] + "push %%"REG_a" \n\t" + "add %c6(%%"REG_SP"), %%"REG_a" \n\t" + "movq (%%"REG_a"), %%mm0 \n\t" // block[i] + "pop %%"REG_a" \n\t" "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00 "pxor %%mm1, %%mm0 \n\t" "psubw %%mm1, %%mm0 \n\t" // ABS(block[i]) @@ -183,9 +187,12 @@ static int RENAME(dct_quantize)(MpegEncC PMAX(%%mm3, %%mm0) "movd %%mm3, %%"REG_a" \n\t" "movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 - : "+a" (last_non_zero_p1) - : "r" (block+64), "r" (qmat+64), "r" (bias+64), - "r" (inv_zigzag_direct16+64), "r" (temp_block+64) + "mov %%"REG_a", %0 \n\t" + "pop %%"REG_a" \n\t" + : "+m" (last_non_zero_p1) + : "a" (block+64), "r" (qmat+64), "r" (bias+64), + "r" (inv_zigzag_direct16+64), "r" (temp_block+64), + "i" (sizeof(long)) ); // note the asm is split cuz gcc doesnt like that many operands ... 
asm volatile( diff -urp ffmpeg-old/libavcodec/i386/simple_idct_mmx.c ffmpeg/libavcodec/i386/simple_idct_mmx.c --- ffmpeg-old/libavcodec/i386/simple_idct_mmx.c 2006-10-17 14:24:19.000000000 +0200 +++ ffmpeg/libavcodec/i386/simple_idct_mmx.c 2007-01-27 22:43:34.000000000 +0100 @@ -363,7 +363,7 @@ static inline void idct(int16_t *block) "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq "MANGLE(wm1010)", %%mm4 \n\t"\ + "movq %3, %%mm4 \n\t"\ "pand %%mm0, %%mm4 \n\t"\ "por %%mm1, %%mm4 \n\t"\ "por %%mm2, %%mm4 \n\t"\ @@ -471,7 +471,7 @@ COL_IDCT( 24(%1), 88(%1), 56(%1), 120(% "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ - "movq "MANGLE(wm1010)", %%mm4 \n\t"\ + "movq %3, %%mm4 \n\t"\ "pand %%mm0, %%mm4 \n\t"\ "por %%mm1, %%mm4 \n\t"\ "por %%mm2, %%mm4 \n\t"\ @@ -545,7 +545,7 @@ COL_IDCT( 24(%1), 88(%1), 56(%1), 120(% "jmp 2f \n\t"\ "1: \n\t"\ "pslld $16, %%mm0 \n\t"\ - "paddd "MANGLE(d40000)", %%mm0 \n\t"\ + "paddd %4, %%mm0 \n\t"\ "psrad $13, %%mm0 \n\t"\ "packssdw %%mm0, %%mm0 \n\t"\ "movq %%mm0, " #dst " \n\t"\ @@ -1270,7 +1270,7 @@ Temp */ "9: \n\t" - :: "r" (block), "r" (temp), "r" (coeffs) + :: "r" (block), "r" (temp), "r" (coeffs), "m" (wm1010), "m" (d40000) : "%eax" ); } diff -urp ffmpeg-old/libavcodec/i386/snowdsp_mmx.c ffmpeg/libavcodec/i386/snowdsp_mmx.c --- ffmpeg-old/libavcodec/i386/snowdsp_mmx.c 2006-10-17 14:24:19.000000000 +0200 +++ ffmpeg/libavcodec/i386/snowdsp_mmx.c 2007-01-28 00:32:55.000000000 +0100 @@ -629,10 +629,9 @@ void ff_snow_vertical_compose97i_mmx(DWT #define snow_inner_add_yblock_sse2_header \ DWTELEM * * dst_array = sb->line + src_y;\ - long tmp;\ + long tmp = b_h;\ asm volatile(\ - "mov %7, %%"REG_c" \n\t"\ - "mov %6, %2 \n\t"\ + "mov %6, %%"REG_c" \n\t"\ "mov %4, %%"REG_S" \n\t"\ "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ "pcmpeqd %%xmm3, %%xmm3 
\n\t"\ @@ -689,9 +688,9 @@ void ff_snow_vertical_compose97i_mmx(DWT #define snow_inner_add_yblock_sse2_end_common2\ "jnz 1b \n\t"\ - :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ + :"+m"(dst8),"+m"(dst_array),"=m"(tmp)\ :\ - "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ + "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)src_stride):\ "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); #define snow_inner_add_yblock_sse2_end_8\ @@ -705,7 +704,7 @@ void ff_snow_vertical_compose97i_mmx(DWT #define snow_inner_add_yblock_sse2_end_16\ "add $"PTR_SIZE"*1, %1 \n\t"\ snow_inner_add_yblock_sse2_end_common1\ - "dec %2 \n\t"\ + "sub $1, %2 \n\t"\ snow_inner_add_yblock_sse2_end_common2 static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, @@ -795,10 +794,9 @@ snow_inner_add_yblock_sse2_end_16 #define snow_inner_add_yblock_mmx_header \ DWTELEM * * dst_array = sb->line + src_y;\ - long tmp;\ + long tmp = b_h;\ asm volatile(\ - "mov %7, %%"REG_c" \n\t"\ - "mov %6, %2 \n\t"\ + "mov %6, %%"REG_c" \n\t"\ "mov %4, %%"REG_S" \n\t"\ "pxor %%mm7, %%mm7 \n\t" /* 0 */\ "pcmpeqd %%mm3, %%mm3 \n\t"\ @@ -861,11 +859,11 @@ snow_inner_add_yblock_sse2_end_16 "add %%"REG_c", (%%"REG_a") \n\t"\ "add $"PTR_SIZE"*1, %1 \n\t"\ "add %%"REG_c", %0 \n\t"\ - "dec %2 \n\t"\ + "sub $1, %2 \n\t"\ "jnz 1b \n\t"\ - :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ + :"+m"(dst8),"+m"(dst_array),"=m"(tmp)\ :\ - "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)b_h),"m"((long)src_stride):\ + "rm"((long)(src_x<<2)),"m"(obmc),"a"(block),"m"((long)src_stride):\ "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); static void inner_add_yblock_bw_8_obmc_16_mmx(uint8_t *obmc, const long obmc_stride, uint8_t * * block, int b_w, long b_h, diff -urp ffmpeg-old/libavcodec/liba52/resample_mmx.c ffmpeg/libavcodec/liba52/resample_mmx.c --- ffmpeg-old/libavcodec/liba52/resample_mmx.c 2006-10-17 14:24:22.000000000 +0200 +++ 
ffmpeg/libavcodec/liba52/resample_mmx.c 2007-01-27 22:43:34.000000000 +0100 @@ -35,10 +35,10 @@ static int a52_resample_MONO_to_5_MMX(fl int32_t * f = (int32_t *) _f; asm volatile( "movl $-512, %%esi \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" - "movq "MANGLE(wm1100)", %%mm3 \n\t" - "movq "MANGLE(wm0101)", %%mm4 \n\t" - "movq "MANGLE(wm1010)", %%mm5 \n\t" + "movq %2, %%mm7 \n\t" + "movq %3, %%mm3 \n\t" + "movq %4, %%mm4 \n\t" + "movq %5, %%mm5 \n\t" "pxor %%mm6, %%mm6 \n\t" "1: \n\t" "movq (%1, %%esi, 2), %%mm0 \n\t" @@ -62,7 +62,7 @@ static int a52_resample_MONO_to_5_MMX(fl "addl $8, %%esi \n\t" " jnz 1b \n\t" "emms \n\t" - :: "r" (s16+1280), "r" (f+256) + :: "r" (s16+1280), "r" (f+256), "m" (magicF2W), "m" (wm1100), "m" (wm0101), "m" (wm1010) :"%esi", "%edi", "memory" ); return 5*256; @@ -90,7 +90,7 @@ static int a52_resample_STEREO_to_2_MMX( );*/ asm volatile( "movl $-1024, %%esi \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "movq %2, %%mm7 \n\t" "1: \n\t" "movq (%1, %%esi), %%mm0 \n\t" "movq 8(%1, %%esi), %%mm1 \n\t" @@ -110,7 +110,7 @@ static int a52_resample_STEREO_to_2_MMX( "addl $16, %%esi \n\t" " jnz 1b \n\t" "emms \n\t" - :: "r" (s16+512), "r" (f+256) + :: "r" (s16+512), "r" (f+256), "m" (magicF2W) :"%esi", "memory" ); return 2*256; @@ -120,7 +120,7 @@ static int a52_resample_3F_to_5_MMX(floa int32_t * f = (int32_t *) _f; asm volatile( "movl $-1024, %%esi \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "movq %2, %%mm7 \n\t" "pxor %%mm6, %%mm6 \n\t" "movq %%mm7, %%mm5 \n\t" "punpckldq %%mm6, %%mm5 \n\t" @@ -165,7 +165,7 @@ static int a52_resample_3F_to_5_MMX(floa "addl $16, %%esi \n\t" " jnz 1b \n\t" "emms \n\t" - :: "r" (s16+1280), "r" (f+256) + :: "r" (s16+1280), "r" (f+256), "m" (magicF2W) :"%esi", "%edi", "memory" ); return 5*256; @@ -175,7 +175,7 @@ static int a52_resample_2F_2R_to_4_MMX(f int32_t * f = (int32_t *) _f; asm volatile( "movl $-1024, %%esi \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "movq %2, %%mm7 \n\t" "1: \n\t" "movq (%1, 
%%esi), %%mm0 \n\t" "movq 8(%1, %%esi), %%mm1 \n\t" @@ -216,7 +216,7 @@ static int a52_resample_2F_2R_to_4_MMX(f "addl $16, %%esi \n\t" " jnz 1b \n\t" "emms \n\t" - :: "r" (s16+1024), "r" (f+256) + :: "r" (s16+1024), "r" (f+256), "m" (magicF2W) :"%esi", "memory" ); return 4*256; @@ -226,7 +226,7 @@ static int a52_resample_3F_2R_to_5_MMX(f int32_t * f = (int32_t *) _f; asm volatile( "movl $-1024, %%esi \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "movq %2, %%mm7 \n\t" "1: \n\t" "movd (%1, %%esi), %%mm0 \n\t" "punpckldq 2048(%1, %%esi), %%mm0\n\t" @@ -275,7 +275,7 @@ static int a52_resample_3F_2R_to_5_MMX(f "addl $16, %%esi \n\t" " jnz 1b \n\t" "emms \n\t" - :: "r" (s16+1280), "r" (f+256) + :: "r" (s16+1280), "r" (f+256), "m" (magicF2W) :"%esi", "%edi", "memory" ); return 5*256; @@ -285,7 +285,7 @@ static int a52_resample_MONO_LFE_to_6_MM int32_t * f = (int32_t *) _f; asm volatile( "movl $-1024, %%esi \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "movq %2, %%mm7 \n\t" "pxor %%mm6, %%mm6 \n\t" "1: \n\t" "movq 1024(%1, %%esi), %%mm0 \n\t" @@ -315,7 +315,7 @@ static int a52_resample_MONO_LFE_to_6_MM "addl $16, %%esi \n\t" " jnz 1b \n\t" "emms \n\t" - :: "r" (s16+1536), "r" (f+256) + :: "r" (s16+1536), "r" (f+256), "m" (magicF2W) :"%esi", "%edi", "memory" ); return 6*256; @@ -325,7 +325,7 @@ static int a52_resample_STEREO_LFE_to_6_ int32_t * f = (int32_t *) _f; asm volatile( "movl $-1024, %%esi \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "movq %2, %%mm7 \n\t" "pxor %%mm6, %%mm6 \n\t" "1: \n\t" "movq 1024(%1, %%esi), %%mm0 \n\t" @@ -353,7 +353,7 @@ static int a52_resample_STEREO_LFE_to_6_ "addl $8, %%esi \n\t" " jnz 1b \n\t" "emms \n\t" - :: "r" (s16+1536), "r" (f+256) + :: "r" (s16+1536), "r" (f+256), "m" (magicF2W) :"%esi", "%edi", "memory" ); return 6*256; @@ -363,7 +363,7 @@ static int a52_resample_3F_LFE_to_6_MMX( int32_t * f = (int32_t *) _f; asm volatile( "movl $-1024, %%esi \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "movq %2, %%mm7 \n\t" "pxor %%mm6, 
%%mm6 \n\t" "1: \n\t" "movq 1024(%1, %%esi), %%mm0 \n\t" @@ -393,7 +393,7 @@ static int a52_resample_3F_LFE_to_6_MMX( "addl $8, %%esi \n\t" " jnz 1b \n\t" "emms \n\t" - :: "r" (s16+1536), "r" (f+256) + :: "r" (s16+1536), "r" (f+256), "m" (magicF2W) :"%esi", "%edi", "memory" ); return 6*256; @@ -403,7 +403,7 @@ static int a52_resample_2F_2R_LFE_to_6_M int32_t * f = (int32_t *) _f; asm volatile( "movl $-1024, %%esi \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "movq %2, %%mm7 \n\t" // "pxor %%mm6, %%mm6 \n\t" "1: \n\t" "movq 1024(%1, %%esi), %%mm0 \n\t" @@ -439,7 +439,7 @@ static int a52_resample_2F_2R_LFE_to_6_M "addl $8, %%esi \n\t" " jnz 1b \n\t" "emms \n\t" - :: "r" (s16+1536), "r" (f+256) + :: "r" (s16+1536), "r" (f+256), "m" (magicF2W) :"%esi", "%edi", "memory" ); return 6*256; @@ -449,7 +449,7 @@ static int a52_resample_3F_2R_LFE_to_6_M int32_t * f = (int32_t *) _f; asm volatile( "movl $-1024, %%esi \n\t" - "movq "MANGLE(magicF2W)", %%mm7 \n\t" + "movq %2, %%mm7 \n\t" // "pxor %%mm6, %%mm6 \n\t" "1: \n\t" "movq 1024(%1, %%esi), %%mm0 \n\t" @@ -487,7 +487,7 @@ static int a52_resample_3F_2R_LFE_to_6_M "addl $8, %%esi \n\t" " jnz 1b \n\t" "emms \n\t" - :: "r" (s16+1536), "r" (f+256) + :: "r" (s16+1536), "r" (f+256), "m" (magicF2W) :"%esi", "%edi", "memory" ); return 6*256; diff -urp ffmpeg-old/libpostproc/postprocess_template.c ffmpeg/libpostproc/postprocess_template.c --- ffmpeg-old/libpostproc/postprocess_template.c 2006-10-17 14:24:27.000000000 +0200 +++ ffmpeg/libpostproc/postprocess_template.c 2007-01-28 01:35:32.000000000 +0100 @@ -387,16 +387,16 @@ static inline void RENAME(vertRK1Filter) // FIXME rounding asm volatile( "pxor %%mm7, %%mm7 \n\t" // 0 - "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE + "movq %2, %%mm6 \n\t" // MIN_SIGNED_BYTE "leal (%0, %1), %%"REG_a" \n\t" "leal (%%"REG_a", %1, 4), %%"REG_c" \n\t" // 0 1 2 3 4 5 6 7 8 9 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 - "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP 
+ "movq %3, %%mm0 \n\t" // QP,..., QP "movq %%mm0, %%mm1 \n\t" // QP,..., QP - "paddusb "MANGLE(b02)", %%mm0 \n\t" + "paddusb %4, %%mm0 \n\t" "psrlw $2, %%mm0 \n\t" - "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 + "pand %5, %%mm0 \n\t" // QP/4,..., QP/4 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... "movq (%0, %1, 4), %%mm2 \n\t" // line 4 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 @@ -425,8 +425,8 @@ static inline void RENAME(vertRK1Filter) "paddb %%mm6, %%mm5 \n\t" "psrlw $2, %%mm5 \n\t" - "pand "MANGLE(b3F)", %%mm5 \n\t" - "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 + "pand %5, %%mm5 \n\t" + "psubb %6, %%mm5 \n\t" // (l5-l4)/8 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 @@ -441,7 +441,7 @@ static inline void RENAME(vertRK1Filter) "movq %%mm2, (%%"REG_c", %1) \n\t" : - : "r" (src), "r" ((long)stride) + : "r" (src), "r" ((long)stride), "m" (b80), "m" (pQPb), "m" (b02), "m" (b3F), "m" (b20) : "%"REG_a, "%"REG_c ); #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) @@ -517,7 +517,7 @@ static inline void RENAME(vertX1Filter)( "paddusb %%mm0, %%mm0 \n\t" "psubusb %%mm0, %%mm4 \n\t" "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 - "psubusb "MANGLE(b01)", %%mm3 \n\t" + "psubusb %3, %%mm3 \n\t" "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 PAVGB(%%mm7, %%mm3) // d/2 @@ -566,7 +566,7 @@ static inline void RENAME(vertX1Filter)( "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 : - : "r" (src), "r" ((long)stride), "m" (co->pQPb) + : "r" (src), "r" ((long)stride), "m" (co->pQPb), "m" (b01) : "%"REG_a, "%"REG_c ); #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) @@ -699,17 +699,17 @@ static inline void RENAME(doVertDefFilte PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? 
- "paddusb "MANGLE(b01)", %%mm4 \n\t" + "paddusb %3, %%mm4 \n\t" "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 "pand %%mm4, %%mm3 \n\t" "movq %%mm3, %%mm1 \n\t" -// "psubusb "MANGLE(b01)", %%mm3 \n\t" +// "psubusb %3, %%mm3 \n\t" PAVGB(%%mm7, %%mm3) PAVGB(%%mm7, %%mm3) "paddusb %%mm1, %%mm3 \n\t" -// "paddusb "MANGLE(b01)", %%mm3 \n\t" +// "paddusb %3, %%mm3 \n\t" "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 "movq (%0, %1, 4), %%mm5 \n\t" //l4 @@ -722,7 +722,7 @@ static inline void RENAME(doVertDefFilte "pand %%mm0, %%mm3 \n\t" PMINUB(%%mm5, %%mm3, %%mm0) - "psubusb "MANGLE(b01)", %%mm3 \n\t" + "psubusb %3, %%mm3 \n\t" PAVGB(%%mm7, %%mm3) "movq (%%"REG_a", %1, 2), %%mm0 \n\t" @@ -754,7 +754,7 @@ static inline void RENAME(doVertDefFilte "movq (%%"REG_a", %1), %%mm3 \n\t" // l2 "pxor %%mm6, %%mm2 \n\t" // -l5-1 "movq %%mm2, %%mm5 \n\t" // -l5-1 - "movq "MANGLE(b80)", %%mm4 \n\t" // 128 + "movq %4, %%mm4 \n\t" // 128 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 @@ -766,7 +766,7 @@ static inline void RENAME(doVertDefFilte "pxor %%mm6, %%mm2 \n\t" // -l1-1 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 PAVGB((%0), %%mm1) // (l0-l3+256)/2 - "movq "MANGLE(b80)", %%mm3 \n\t" // 128 + "movq %4, %%mm3 \n\t" // 128 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 @@ -776,14 +776,14 @@ static inline void RENAME(doVertDefFilte "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 "pxor %%mm6, %%mm1 \n\t" // -l7-1 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 - "movq "MANGLE(b80)", %%mm2 \n\t" // 128 + "movq %4, %%mm2 \n\t" // 128 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 - "movq 
"MANGLE(b00)", %%mm1 \n\t" // 0 - "movq "MANGLE(b00)", %%mm5 \n\t" // 0 + "movq %5, %%mm1 \n\t" // 0 + "movq %5, %%mm5 \n\t" // 0 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| @@ -792,7 +792,7 @@ static inline void RENAME(doVertDefFilte // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 - "movq "MANGLE(b00)", %%mm7 \n\t" // 0 + "movq %5, %%mm7 \n\t" // 0 "movq %2, %%mm2 \n\t" // QP PAVGB(%%mm6, %%mm2) // 128 + QP/2 "psubb %%mm6, %%mm2 \n\t" @@ -806,13 +806,13 @@ static inline void RENAME(doVertDefFilte // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 "movq %%mm4, %%mm3 \n\t" // d - "psubusb "MANGLE(b01)", %%mm4 \n\t" + "psubusb %3, %%mm4 \n\t" PAVGB(%%mm7, %%mm4) // d/32 PAVGB(%%mm7, %%mm4) // (d + 32)/64 "paddb %%mm3, %%mm4 \n\t" // 5d/64 "pand %%mm2, %%mm4 \n\t" - "movq "MANGLE(b80)", %%mm5 \n\t" // 128 + "movq %4, %%mm5 \n\t" // 128 "psubb %%mm0, %%mm5 \n\t" // q "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) @@ -834,7 +834,7 @@ static inline void RENAME(doVertDefFilte "movq %%mm2, (%0, %1, 4) \n\t" : - : "r" (src), "r" ((long)stride), "m" (c->pQPb) + : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m" (b01), "m" (b80), "m" (b00) : "%"REG_a, "%"REG_c ); @@ -1078,10 +1078,10 @@ src-=8; "psubusw %%mm1, %%mm5 \n\t" // ld - "movq "MANGLE(w05)", %%mm2 \n\t" // 5 + "movq %3, %%mm2 \n\t" // 5 "pmullw %%mm2, %%mm4 \n\t" "pmullw %%mm2, %%mm5 \n\t" - "movq "MANGLE(w20)", %%mm2 \n\t" // 32 + "movq %4, %%mm2 \n\t" // 32 "paddw %%mm2, %%mm4 \n\t" "paddw %%mm2, %%mm5 \n\t" "psrlw $6, %%mm4 \n\t" @@ -1131,7 +1131,7 @@ src-=8; "movq %%mm0, (%0, %1) \n\t" : "+r" (src) - : "r" ((long)stride), "m" (c->pQPb) + : "r" ((long)stride), "m" (c->pQPb), "m" (w05), "m" (w20) : "%"REG_a, "%"REG_c ); #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) @@ -1275,7 +1275,7 @@ FIND_MIN_MAX((%0, %1, 8)) "movq %%mm6, %%mm0 
\n\t" // max "psubb %%mm7, %%mm6 \n\t" // max - min "movd %%mm6, %%ecx \n\t" - "cmpb "MANGLE(deringThreshold)", %%cl \n\t" + "cmpb %4, %%cl \n\t" " jb 1f \n\t" "lea -24(%%"REG_SP"), %%"REG_c" \n\t" "and "ALIGN_MASK", %%"REG_c" \n\t" @@ -1302,9 +1302,9 @@ FIND_MIN_MAX((%0, %1, 8)) "psubusb %%mm7, %%mm0 \n\t" "psubusb %%mm7, %%mm2 \n\t" "psubusb %%mm7, %%mm3 \n\t" - "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 - "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 - "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 + "pcmpeqb %5, %%mm0 \n\t" // L10 > a ? 0 : -1 + "pcmpeqb %5, %%mm2 \n\t" // L20 > a ? 0 : -1 + "pcmpeqb %5, %%mm3 \n\t" // L00 > a ? 0 : -1 "paddb %%mm2, %%mm0 \n\t" "paddb %%mm3, %%mm0 \n\t" @@ -1325,9 +1325,9 @@ FIND_MIN_MAX((%0, %1, 8)) "psubusb %%mm7, %%mm2 \n\t" "psubusb %%mm7, %%mm4 \n\t" "psubusb %%mm7, %%mm5 \n\t" - "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 - "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 - "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 + "pcmpeqb %5, %%mm2 \n\t" // L11 > a ? 0 : -1 + "pcmpeqb %5, %%mm4 \n\t" // L21 > a ? 0 : -1 + "pcmpeqb %5, %%mm5 \n\t" // L01 > a ? 0 : -1 "paddb %%mm4, %%mm2 \n\t" "paddb %%mm5, %%mm2 \n\t" // 0, 2, 3, 1 @@ -1352,7 +1352,7 @@ FIND_MIN_MAX((%0, %1, 8)) "psubusb " #lx ", " #t1 " \n\t"\ "psubusb " #lx ", " #t0 " \n\t"\ "psubusb " #lx ", " #sx " \n\t"\ - "movq "MANGLE(b00)", " #lx " \n\t"\ + "movq %5, " #lx " \n\t"\ "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 
0 : -1*/\ @@ -1368,8 +1368,8 @@ FIND_MIN_MAX((%0, %1, 8)) PMINUB(t1, pplx, t0)\ "paddb " #sx ", " #ppsx " \n\t"\ "paddb " #psx ", " #ppsx " \n\t"\ - "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ - "pand "MANGLE(b08)", " #ppsx " \n\t"\ + "#paddb %6, " #ppsx " \n\t"\ + "pand %7, " #ppsx " \n\t"\ "pcmpeqb " #lx ", " #ppsx " \n\t"\ "pand " #ppsx ", " #pplx " \n\t"\ "pandn " #dst ", " #ppsx " \n\t"\ @@ -1405,7 +1405,7 @@ DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) "1: \n\t" - : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2) + : : "r" (src), "r" ((long)stride), "m" (c->pQPb), "m"(c->pQPb2), "m" (deringThreshold), "m" (b00), "m" (b02), "m" (b08) : "%"REG_a, "%"REG_d, "%"REG_c ); #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) @@ -2283,7 +2283,7 @@ static inline void RENAME(tempNoiseReduc #else //L1_DIFF #if defined (FAST_L2_DIFF) "pcmpeqb %%mm7, %%mm7 \n\t" - "movq "MANGLE(b80)", %%mm6 \n\t" + "movq %4, %%mm6 \n\t" "pxor %%mm0, %%mm0 \n\t" #define REAL_L2_DIFF_CORE(a, b)\ "movq " #a ", %%mm5 \n\t"\ @@ -2532,7 +2532,7 @@ L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc "4: \n\t" - :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast) + :: "r" (src), "r" (tempBlured), "r"((long)stride), "m" (tempBluredPast), "m" (b80) : "%"REG_a, "%"REG_d, "%"REG_c, "memory" ); #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) @@ -2805,8 +2805,8 @@ asm volatile( "movq %%mm6, %%mm1 \n\t" "psllw $2, %%mm0 \n\t" "psllw $2, %%mm1 \n\t" - "paddw "MANGLE(w04)", %%mm0 \n\t" - "paddw "MANGLE(w04)", %%mm1 \n\t" + "paddw %5, %%mm0 \n\t" + "paddw %5, %%mm1 \n\t" #define NEXT\ "movq (%0), %%mm2 \n\t"\ @@ -2895,7 +2895,7 @@ asm volatile( "mov %4, %0 \n\t" //FIXME : "+&r"(src) - : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src) + : "r" ((long)step), "m" (c->pQPb), "r"(sums), "g"(src), "m" (w04) ); src+= step; // src points to begin of the 8x8 Block @@ -3112,10 +3112,10 @@ asm volatile( 
"psubusw %%mm1, %%mm5 \n\t" // ld - "movq "MANGLE(w05)", %%mm2 \n\t" // 5 + "movq %4, %%mm2 \n\t" // 5 "pmullw %%mm2, %%mm4 \n\t" "pmullw %%mm2, %%mm5 \n\t" - "movq "MANGLE(w20)", %%mm2 \n\t" // 32 + "movq %5, %%mm2 \n\t" // 32 "paddw %%mm2, %%mm4 \n\t" "paddw %%mm2, %%mm5 \n\t" "psrlw $6, %%mm4 \n\t" @@ -3167,7 +3167,7 @@ asm volatile( "movq %%mm0, (%0, %1) \n\t" : "+r" (temp_src) - : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask) + : "r" ((long)step), "m" (c->pQPb), "m"(eq_mask), "m" (w05), "m" (w20) : "%"REG_a, "%"REG_c ); } @@ -3198,10 +3198,8 @@ static inline void RENAME(blockCopy)(uin { #ifdef HAVE_MMX asm volatile( - "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset - "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale - "lea (%2,%4), %%"REG_a" \n\t" - "lea (%3,%5), %%"REG_d" \n\t" + "movq (%0), %%mm2 \n\t" // packedYOffset + "movq 8(%0), %%mm3 \n\t" // packedYScale "pxor %%mm4, %%mm4 \n\t" #ifdef HAVE_MMX2 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ @@ -3257,22 +3255,24 @@ static inline void RENAME(blockCopy)(uin #define SCALED_CPY(src1, src2, dst1, dst2)\ REAL_SCALED_CPY(src1, src2, dst1, dst2) -SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) -SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2)) -SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4)) - "lea (%%"REG_a",%4,4), %%"REG_a" \n\t" - "lea (%%"REG_d",%5,4), %%"REG_d" \n\t" -SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2)) - - - : "=&a" (packedOffsetAndScale) - : "0" (packedOffsetAndScale), - "r"(src), - "r"(dst), - "r" ((long)srcStride), - "r" ((long)dstStride) - : "%"REG_d - ); +SCALED_CPY((%1), (%1, %3), (%2), (%2, %4)) + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%4,2), %2 \n\t" +SCALED_CPY((%1), (%1, %3), (%2), (%2, %4)) + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%4,2), %2 \n\t" +SCALED_CPY((%1), (%1, %3), (%2), (%2, %4)) + "lea (%1,%3,2), %1 \n\t" + "lea (%2,%4,2), %2 \n\t" +SCALED_CPY((%1), (%1, %3), (%2), (%2, %4)) + + : "+r" (packedOffsetAndScale), 
+ "+r"(src), + "+r"(dst) + : "r" ((long)srcStride), + "r" ((long)dstStride) + : "memory" + ); #else //HAVE_MMX for(i=0; i<8; i++) memcpy( &(dst[dstStride*i]), @@ -3283,8 +3283,6 @@ SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2) { #ifdef HAVE_MMX asm volatile( - "lea (%0,%2), %%"REG_a" \n\t" - "lea (%1,%3), %%"REG_d" \n\t" #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ "movq " #src1 ", %%mm0 \n\t"\ @@ -3295,18 +3293,22 @@ SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2) #define SIMPLE_CPY(src1, src2, dst1, dst2)\ REAL_SIMPLE_CPY(src1, src2, dst1, dst2) -SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) -SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2)) -SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4)) - "lea (%%"REG_a",%2,4), %%"REG_a" \n\t" - "lea (%%"REG_d",%3,4), %%"REG_d" \n\t" -SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2)) - - : : "r" (src), - "r" (dst), - "r" ((long)srcStride), - "r" ((long)dstStride) - : "%"REG_a, "%"REG_d +SIMPLE_CPY((%0), (%0, %2), (%1), (%1, %3)) + "lea (%0,%2,2), %0 \n\t" + "lea (%1,%3,2), %1 \n\t" +SIMPLE_CPY((%0), (%0, %2), (%1), (%1, %3)) + "lea (%0,%2,2), %0 \n\t" + "lea (%1,%3,2), %1 \n\t" +SIMPLE_CPY((%0), (%0, %2), (%1), (%1, %3)) + "lea (%0,%2,2), %0 \n\t" /* step two rows so the last pair copies rows 6 and 7 */ + "lea (%1,%3,2), %1 \n\t" /* step two rows so the last pair copies rows 6 and 7 */ +SIMPLE_CPY((%0), (%0, %2), (%1), (%1, %3)) + + : "+r" (src), + "+r" (dst) + : "r" ((long)srcStride), + "r" ((long)dstStride) + : "memory" ); #else //HAVE_MMX for(i=0; i<8; i++)