diff -urp libtheora-1.0beta2-orig/lib/enc/x86_32/dct_decode_mmx.c libtheora-1.0beta2/lib/enc/x86_32/dct_decode_mmx.c --- libtheora-1.0beta2-orig/lib/enc/x86_32/dct_decode_mmx.c 2007-10-04 20:37:01.000000000 +0200 +++ libtheora-1.0beta2/lib/enc/x86_32/dct_decode_mmx.c 2007-12-17 10:32:44.000000000 +0100 @@ -57,9 +57,9 @@ static void FilterHoriz__mmx(unsigned ch "psubw %%mm3,%%mm1\n" /* mm1 = pix[0]-pix[3] mm1 - mm3 */ \ "movq %%mm0,%%mm7\n" /* mm7 = pix[2]*/ \ "psubw %%mm5,%%mm0\n" /* mm0 = pix[2]-pix[1] mm0 - mm5*/ \ - "PMULLW "MANGLE(V3)",%%mm0\n" /* *3 */ \ + "PMULLW %3,%%mm0\n" /* *3 */ \ "paddw %%mm0,%%mm1\n" /* mm1 has f[0] ... f[4]*/ \ - "paddw "MANGLE(V804)",%%mm1\n"/* add 4 */ /* add 256 after shift */ \ + "paddw %4,%%mm1\n"/* add 4 */ /* add 256 after shift */ \ "psraw $3,%%mm1\n" /* >>3 */ \ " pextrw $0,%%mm1,%%esi\n" /* In MM1 we have 4 f coefs (16bits) */ \ " pextrw $1,%%mm1,%%edi\n" /* now perform MM4 = *(_bv+ f) */ \ @@ -75,20 +75,19 @@ static void FilterHoriz__mmx(unsigned ch " packuswb %%mm0,%%mm5\n" /* mm5 = x x x x newpix1 */ \ " packuswb %%mm0,%%mm7\n" /* mm7 = x x x x newpix2 */ \ " punpcklbw %%mm7,%%mm5\n" /* 2 1 2 1 2 1 2 1 */ \ - " movd %%mm5,%%eax\n" /* eax = newpix21 */ \ - " movw %%ax,1(%0)\n" \ + " movd %%mm5,%%edi\n" /* edi = newpix21 */ \ + " movw %%di,1(%0)\n" \ " psrlq $32,%%mm5\n" /* why is so big stall here ? 
*/ \ - " shrl $16,%%eax\n" \ - " lea 1(%0,%1,2),%%edi\n" \ - " movw %%ax,1(%0,%1,1)\n" \ - " movd %%mm5,%%eax\n" /* eax = newpix21 high part */ \ + " shrl $16,%%edi\n" \ + " movw %%di,1(%0,%1,1)\n" \ + " movd %%mm5,%%edi\n" /* edi = newpix21 high part */ \ " lea (%1,%1,2),%%esi\n" \ - " movw %%ax,(%%edi)\n" \ - " shrl $16,%%eax\n" \ - " movw %%ax,1(%0,%%esi)\n" \ + " movw %%di,1(%0,%1,2)\n" \ + " shrl $16,%%edi\n" \ + " movw %%di,1(%0,%%esi)\n" \ : \ - : "r" (PixelPtr), "r" (LineLength), "r" (BoundingValuePtr-256) \ - : "esi", "edi" , "memory", "eax" \ + : "r" (PixelPtr), "r" (LineLength), "r" (BoundingValuePtr-256), "m" (V3), "m" (V804) \ + : "esi", "edi" , "memory" \ ); OC_LOOP_H_4x4 @@ -126,12 +125,12 @@ static void FilterVert__mmx(unsigned cha "psubw %%mm5,%%mm3\n" "psubw %%mm4,%%mm2\n" /* mm3:mm2 = (pix[ystride*2]-pix[ystride]); */ - "PMULLW "MANGLE(V3)",%%mm3\n" /* *3 */ - "PMULLW "MANGLE(V3)",%%mm2\n" /* *3 */ + "PMULLW %3,%%mm3\n" /* *3 */ + "PMULLW %3,%%mm2\n" /* *3 */ "paddw %%mm7,%%mm3\n" /* highpart */ "paddw %%mm6,%%mm2\n" /* lowpart of pix[0]-pix[ystride*3]+3*(pix[ystride*2]-pix[ystride]); */ - "paddw "MANGLE(V804)",%%mm3\n" /* add 4 */ /* add 256 after shift */ - "paddw "MANGLE(V804)",%%mm2\n" /* add 4 */ /* add 256 after shift */ + "paddw %4,%%mm3\n" /* add 4 */ /* add 256 after shift */ + "paddw %4,%%mm2\n" /* add 4 */ /* add 256 after shift */ "psraw $3,%%mm3\n" /* >>3 f coefs high */ "psraw $3,%%mm2\n" /* >>3 f coefs low */ @@ -168,7 +167,7 @@ static void FilterVert__mmx(unsigned cha "movq %%mm4,(%0,%1)\n" /* pix[ystride]= */ "emms\n" : - : "r" (PixelPtr-2*LineLength), "r" (LineLength), "r" (BoundingValuePtr-256) + : "r" (PixelPtr-2*LineLength), "r" (LineLength), "r" (BoundingValuePtr-256), "m" (V3), "m" (V804) : "esi", "edi" , "memory" ); }