--- libavcodec/x86/h264dsp_mmx.c.ori 2010-04-01 18:52:14.000000000 +0200 +++ libavcodec/x86/h264dsp_mmx.c 2010-05-26 21:18:33.876237887 +0200 @@ -1138,7 +1138,7 @@ int h=8;\ __asm__ volatile(\ "pxor %%mm7, %%mm7 \n\t"\ - "movq %5, %%mm6 \n\t"\ + "movq "MANGLE(ff_pw_5) ", %%mm6\n\t"\ "1: \n\t"\ "movq (%0), %%mm0 \n\t"\ "movq 1(%0), %%mm2 \n\t"\ @@ -1172,7 +1172,7 @@ "punpcklbw %%mm7, %%mm5 \n\t"\ "paddw %%mm3, %%mm2 \n\t"\ "paddw %%mm5, %%mm4 \n\t"\ - "movq %6, %%mm5 \n\t"\ + "movq "MANGLE(ff_pw_16) ", %%mm5\n\t"\ "paddw %%mm5, %%mm2 \n\t"\ "paddw %%mm5, %%mm4 \n\t"\ "paddw %%mm2, %%mm0 \n\t"\ @@ -1186,7 +1186,7 @@ "decl %2 \n\t"\ " jnz 1b \n\t"\ : "+a"(src), "+c"(dst), "+g"(h)\ - : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\ + : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ : "memory"\ );\ }\ @@ -1593,8 +1593,8 @@ int h=8;\ __asm__ volatile(\ "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa %0, %%xmm6 \n\t"\ - :: "m"(ff_pw_5)\ + "movdqa "MANGLE(ff_pw_5) ", %%xmm6 \n\t"\ + ::\ );\ do{\ __asm__ volatile(\ @@ -1617,7 +1617,7 @@ "psllw $2, %%xmm2 \n\t"\ "movq (%2), %%xmm3 \n\t"\ "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw %5, %%xmm0 \n\t"\ + "paddw "MANGLE(ff_pw_16)", %%xmm0 \n\t"\ "pmullw %%xmm6, %%xmm2 \n\t"\ "paddw %%xmm0, %%xmm2 \n\t"\ "psraw $5, %%xmm2 \n\t"\ @@ -1628,8 +1628,7 @@ "add %4, %1 \n\t"\ "add %3, %2 \n\t"\ : "+a"(src), "+c"(dst), "+d"(src2)\ - : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_16)\ + : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\ : "memory"\ );\ }while(--h);\ @@ -1640,7 +1639,7 @@ int h=8;\ __asm__ volatile(\ "pxor %%xmm7, %%xmm7 \n\t"\ - "movdqa %5, %%xmm6 \n\t"\ + "movdqa "MANGLE(ff_pw_5)", %%xmm6\n\t"\ "1: \n\t"\ "lddqu -2(%0), %%xmm1 \n\t"\ "movdqa %%xmm1, %%xmm0 \n\t"\ @@ -1660,7 +1659,7 @@ "paddw %%xmm4, %%xmm1 \n\t"\ "psllw $2, %%xmm2 \n\t"\ "psubw %%xmm1, %%xmm2 \n\t"\ - "paddw %6, %%xmm0 \n\t"\ + "paddw "MANGLE(ff_pw_16)", %%xmm0 \n\t"\ "pmullw %%xmm6, %%xmm2 \n\t"\ "paddw %%xmm0, %%xmm2 \n\t"\ "psraw $5, %%xmm2 \n\t"\ @@ -1671,8 +1670,7 @@ "decl %2 \n\t"\ " jnz 1b \n\t"\ : "+a"(src), "+c"(dst), "+g"(h)\ - : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\ - "m"(ff_pw_5), "m"(ff_pw_16)\ + : "D"((x86_reg)srcStride), "S"((x86_reg)dstStride)\ : "memory"\ );\ }\ --- libavcodec/x86/dsputil_mmx.c.ori 2010-03-16 02:17:00.000000000 +0100 +++ libavcodec/x86/dsputil_mmx.c 2010-05-26 20:47:45.957265915 +0200 @@ -724,11 +724,16 @@ } static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){ + uint32_t l1 = *(uint32_t*)(src + 0*src_stride); + uint32_t l2 = *(uint32_t*)(src + 1*src_stride); + uint32_t l3 = *(uint32_t*)(src + 2*src_stride); + uint32_t l4 = *(uint32_t*)(src + 3*src_stride); + __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ... - "movd %4, %%mm0 \n\t" - "movd %5, %%mm1 \n\t" - "movd %6, %%mm2 \n\t" - "movd %7, %%mm3 \n\t" + "movd %0, %%mm0 \n\t" + "movd %1, %%mm1 \n\t" + "movd %2, %%mm2 \n\t" + "movd %3, %%mm3 \n\t" "punpcklbw %%mm1, %%mm0 \n\t" "punpcklbw %%mm3, %%mm2 \n\t" "movq %%mm0, %%mm1 \n\t" @@ -741,15 +746,16 @@ "punpckhdq %%mm1, %%mm1 \n\t" "movd %%mm1, %3 \n\t" - : "=m" (*(uint32_t*)(dst + 0*dst_stride)), - "=m" (*(uint32_t*)(dst + 1*dst_stride)), - "=m" (*(uint32_t*)(dst + 2*dst_stride)), - "=m" (*(uint32_t*)(dst + 3*dst_stride)) - : "m" (*(uint32_t*)(src + 0*src_stride)), - "m" (*(uint32_t*)(src + 1*src_stride)), - "m" (*(uint32_t*)(src + 2*src_stride)), - "m" (*(uint32_t*)(src + 3*src_stride)) + : "+m" (l1), + "+m" (l2), + "+m" (l3), + "+m" (l4) ); + + *(uint32_t*)(dst + 0*dst_stride) = l1; + *(uint32_t*)(dst + 1*dst_stride) = l2; + *(uint32_t*)(dst + 2*dst_stride) = l3; + *(uint32_t*)(dst + 3*dst_stride) = l4; } static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){