Lines 57-65
static void FilterHoriz__mmx(unsigned ch
Link Here
|
57 |
"psubw %%mm3,%%mm1\n" /* mm1 = pix[0]-pix[3] mm1 - mm3 */ \ |
57 |
"psubw %%mm3,%%mm1\n" /* mm1 = pix[0]-pix[3] mm1 - mm3 */ \ |
58 |
"movq %%mm0,%%mm7\n" /* mm7 = pix[2]*/ \ |
58 |
"movq %%mm0,%%mm7\n" /* mm7 = pix[2]*/ \ |
59 |
"psubw %%mm5,%%mm0\n" /* mm0 = pix[2]-pix[1] mm0 - mm5*/ \ |
59 |
"psubw %%mm5,%%mm0\n" /* mm0 = pix[2]-pix[1] mm0 - mm5*/ \ |
60 |
"PMULLW "MANGLE(V3)",%%mm0\n" /* *3 */ \ |
60 |
"PMULLW %3,%%mm0\n" /* *3 */ \ |
61 |
"paddw %%mm0,%%mm1\n" /* mm1 has f[0] ... f[4]*/ \ |
61 |
"paddw %%mm0,%%mm1\n" /* mm1 has f[0] ... f[4]*/ \ |
62 |
"paddw "MANGLE(V804)",%%mm1\n"/* add 4 */ /* add 256 after shift */ \ |
62 |
"paddw %4,%%mm1\n"/* add 4 */ /* add 256 after shift */ \ |
63 |
"psraw $3,%%mm1\n" /* >>3 */ \ |
63 |
"psraw $3,%%mm1\n" /* >>3 */ \ |
64 |
" pextrw $0,%%mm1,%%esi\n" /* In MM1 we have 4 f coefs (16bits) */ \ |
64 |
" pextrw $0,%%mm1,%%esi\n" /* In MM1 we have 4 f coefs (16bits) */ \ |
65 |
" pextrw $1,%%mm1,%%edi\n" /* now perform MM4 = *(_bv+ f) */ \ |
65 |
" pextrw $1,%%mm1,%%edi\n" /* now perform MM4 = *(_bv+ f) */ \ |
Lines 75-94
static void FilterHoriz__mmx(unsigned ch
Link Here
|
75 |
" packuswb %%mm0,%%mm5\n" /* mm5 = x x x x newpix1 */ \ |
75 |
" packuswb %%mm0,%%mm5\n" /* mm5 = x x x x newpix1 */ \ |
76 |
" packuswb %%mm0,%%mm7\n" /* mm7 = x x x x newpix2 */ \ |
76 |
" packuswb %%mm0,%%mm7\n" /* mm7 = x x x x newpix2 */ \ |
77 |
" punpcklbw %%mm7,%%mm5\n" /* 2 1 2 1 2 1 2 1 */ \ |
77 |
" punpcklbw %%mm7,%%mm5\n" /* 2 1 2 1 2 1 2 1 */ \ |
78 |
" movd %%mm5,%%eax\n" /* eax = newpix21 */ \ |
78 |
" movd %%mm5,%%edi\n" /* edi = newpix21 */ \ |
79 |
" movw %%ax,1(%0)\n" \ |
79 |
" movw %%di,1(%0)\n" \ |
80 |
" psrlq $32,%%mm5\n" /* why is so big stall here ? */ \ |
80 |
" psrlq $32,%%mm5\n" /* why is so big stall here ? */ \ |
81 |
" shrl $16,%%eax\n" \ |
81 |
" shrl $16,%%edi\n" \ |
82 |
" lea 1(%0,%1,2),%%edi\n" \ |
82 |
" movw %%di,1(%0,%1,1)\n" \ |
83 |
" movw %%ax,1(%0,%1,1)\n" \ |
83 |
" movd %%mm5,%%edi\n" /* edi = newpix21 high part */ \ |
84 |
" movd %%mm5,%%eax\n" /* eax = newpix21 high part */ \ |
|
|
85 |
" lea (%1,%1,2),%%esi\n" \ |
84 |
" lea (%1,%1,2),%%esi\n" \ |
86 |
" movw %%ax,(%%edi)\n" \ |
85 |
" movw %%di,1(%0,%1,2)\n" \ |
87 |
" shrl $16,%%eax\n" \ |
86 |
" shrl $16,%%edi\n" \ |
88 |
" movw %%ax,1(%0,%%esi)\n" \ |
87 |
" movw %%di,1(%0,%%esi)\n" \ |
89 |
: \ |
88 |
: \ |
90 |
: "r" (PixelPtr), "r" (LineLength), "r" (BoundingValuePtr-256) \ |
89 |
: "r" (PixelPtr), "r" (LineLength), "r" (BoundingValuePtr-256), "m" (V3), "m" (V804) \ |
91 |
: "esi", "edi" , "memory", "eax" \ |
90 |
: "esi", "edi" , "memory" \ |
92 |
); |
91 |
); |
93 |
|
92 |
|
94 |
OC_LOOP_H_4x4 |
93 |
OC_LOOP_H_4x4 |
Lines 126-137
static void FilterVert__mmx(unsigned cha
Link Here
|
126 |
"psubw %%mm5,%%mm3\n" |
125 |
"psubw %%mm5,%%mm3\n" |
127 |
"psubw %%mm4,%%mm2\n" |
126 |
"psubw %%mm4,%%mm2\n" |
128 |
/* mm3:mm2 = (pix[ystride*2]-pix[ystride]); */ |
127 |
/* mm3:mm2 = (pix[ystride*2]-pix[ystride]); */ |
129 |
"PMULLW "MANGLE(V3)",%%mm3\n" /* *3 */ |
128 |
"PMULLW %3,%%mm3\n" /* *3 */ |
130 |
"PMULLW "MANGLE(V3)",%%mm2\n" /* *3 */ |
129 |
"PMULLW %3,%%mm2\n" /* *3 */ |
131 |
"paddw %%mm7,%%mm3\n" /* highpart */ |
130 |
"paddw %%mm7,%%mm3\n" /* highpart */ |
132 |
"paddw %%mm6,%%mm2\n" /* lowpart of pix[0]-pix[ystride*3]+3*(pix[ystride*2]-pix[ystride]); */ |
131 |
"paddw %%mm6,%%mm2\n" /* lowpart of pix[0]-pix[ystride*3]+3*(pix[ystride*2]-pix[ystride]); */ |
133 |
"paddw "MANGLE(V804)",%%mm3\n" /* add 4 */ /* add 256 after shift */ |
132 |
"paddw %4,%%mm3\n" /* add 4 */ /* add 256 after shift */ |
134 |
"paddw "MANGLE(V804)",%%mm2\n" /* add 4 */ /* add 256 after shift */ |
133 |
"paddw %4,%%mm2\n" /* add 4 */ /* add 256 after shift */ |
135 |
"psraw $3,%%mm3\n" /* >>3 f coefs high */ |
134 |
"psraw $3,%%mm3\n" /* >>3 f coefs high */ |
136 |
"psraw $3,%%mm2\n" /* >>3 f coefs low */ |
135 |
"psraw $3,%%mm2\n" /* >>3 f coefs low */ |
137 |
|
136 |
|
Lines 168-174
static void FilterVert__mmx(unsigned cha
Link Here
|
168 |
"movq %%mm4,(%0,%1)\n" /* pix[ystride]= */ |
167 |
"movq %%mm4,(%0,%1)\n" /* pix[ystride]= */ |
169 |
"emms\n" |
168 |
"emms\n" |
170 |
: |
169 |
: |
171 |
: "r" (PixelPtr-2*LineLength), "r" (LineLength), "r" (BoundingValuePtr-256) |
170 |
: "r" (PixelPtr-2*LineLength), "r" (LineLength), "r" (BoundingValuePtr-256), "m" (V3), "m" (V804) |
172 |
: "esi", "edi" , "memory" |
171 |
: "esi", "edi" , "memory" |
173 |
); |
172 |
); |
174 |
} |
173 |
} |