Lines 79-95
Link Here
|
79 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
79 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
80 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
80 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
81 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
81 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
82 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
82 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
83 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
83 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
84 |
".L1010: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
84 |
"1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
85 |
"paddusb (%%ebx), %%mm1 \n\t" /* mm1=Src1+Src2 (add 8 bytes with saturation) */ |
85 |
"paddusb (%%ebx), %%mm1 \n\t" /* mm1=Src1+Src2 (add 8 bytes with saturation) */ |
86 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
86 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
87 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
87 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
88 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
88 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
89 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
89 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
90 |
"jnz .L1010 \n\t" /* check loop termination, proceed if required */ |
90 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
91 |
"emms \n\t" /* exit MMX state */ |
91 |
"emms \n\t" /* exit MMX state */ |
92 |
"popa \n\t":"=m" (Dest) /* %0 */ |
92 |
"popa \n\t":"=m" (Dest) /* %0 */ |
93 |
:"m"(Src2), /* %1 */ |
93 |
:"m"(Src2), /* %1 */ |
94 |
"m"(Src1), /* %2 */ |
94 |
"m"(Src1), /* %2 */ |
95 |
"m"(length) /* %3 */ |
95 |
"m"(length) /* %3 */ |
Lines 156-166
Link Here
|
156 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
156 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
157 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
157 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
158 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
158 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
159 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
159 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
160 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
160 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
161 |
".L21011: \n\t" |
161 |
"1: \n\t" |
162 |
"movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
162 |
"movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
163 |
"movq (%%ebx), %%mm2 \n\t" /* load 8 bytes from Src2 into mm2 */ |
163 |
"movq (%%ebx), %%mm2 \n\t" /* load 8 bytes from Src2 into mm2 */ |
164 |
/* --- Byte shift via Word shift --- */ |
164 |
/* --- Byte shift via Word shift --- */ |
165 |
"psrlw $1, %%mm1 \n\t" /* shift 4 WORDS of mm1 1 bit to the right */ |
165 |
"psrlw $1, %%mm1 \n\t" /* shift 4 WORDS of mm1 1 bit to the right */ |
166 |
"psrlw $1, %%mm2 \n\t" /* shift 4 WORDS of mm2 1 bit to the right */ |
166 |
"psrlw $1, %%mm2 \n\t" /* shift 4 WORDS of mm2 1 bit to the right */ |
Lines 172-182
Link Here
|
172 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
172 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
173 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
173 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
174 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
174 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
175 |
"add $8, %%edi \n\t" |
175 |
"add $8, %%edi \n\t" |
176 |
"dec %%ecx \n\t" /* decrease loop counter */ |
176 |
"dec %%ecx \n\t" /* decrease loop counter */ |
177 |
"jnz .L21011 \n\t" /* check loop termination, proceed if required */ |
177 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
178 |
"emms \n\t" /* exit MMX state */ |
178 |
"emms \n\t" /* exit MMX state */ |
179 |
"popa \n\t":"=m" (Dest) /* %0 */ |
179 |
"popa \n\t":"=m" (Dest) /* %0 */ |
180 |
:"m"(Src2), /* %1 */ |
180 |
:"m"(Src2), /* %1 */ |
181 |
"m"(Src1), /* %2 */ |
181 |
"m"(Src1), /* %2 */ |
182 |
"m"(length), /* %3 */ |
182 |
"m"(length), /* %3 */ |
Lines 239-255
Link Here
|
239 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
239 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
240 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
240 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
241 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
241 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
242 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
242 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
243 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
243 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
244 |
".L1012: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
244 |
"1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
245 |
"psubusb (%%ebx), %%mm1 \n\t" /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ |
245 |
"psubusb (%%ebx), %%mm1 \n\t" /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ |
246 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
246 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
247 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
247 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
248 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
248 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
249 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
249 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
250 |
"jnz .L1012 \n\t" /* check loop termination, proceed if required */ |
250 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
251 |
"emms \n\t" /* exit MMX state */ |
251 |
"emms \n\t" /* exit MMX state */ |
252 |
"popa \n\t":"=m" (Dest) /* %0 */ |
252 |
"popa \n\t":"=m" (Dest) /* %0 */ |
253 |
:"m"(Src2), /* %1 */ |
253 |
:"m"(Src2), /* %1 */ |
254 |
"m"(Src1), /* %2 */ |
254 |
"m"(Src1), /* %2 */ |
255 |
"m"(length) /* %3 */ |
255 |
"m"(length) /* %3 */ |
Lines 311-330
Link Here
|
311 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
311 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
312 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
312 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
313 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
313 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
314 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
314 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
315 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
315 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
316 |
".L1013: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
316 |
"1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
317 |
"movq (%%ebx), %%mm2 \n\t" /* load 8 bytes from Src2 into mm2 */ |
317 |
"movq (%%ebx), %%mm2 \n\t" /* load 8 bytes from Src2 into mm2 */ |
318 |
"psubusb (%%ebx), %%mm1 \n\t" /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ |
318 |
"psubusb (%%ebx), %%mm1 \n\t" /* mm1=Src1-Src2 (sub 8 bytes with saturation) */ |
319 |
"psubusb (%%eax), %%mm2 \n\t" /* mm2=Src2-Src1 (sub 8 bytes with saturation) */ |
319 |
"psubusb (%%eax), %%mm2 \n\t" /* mm2=Src2-Src1 (sub 8 bytes with saturation) */ |
320 |
"por %%mm2, %%mm1 \n\t" /* combine both mm2 and mm1 results */ |
320 |
"por %%mm2, %%mm1 \n\t" /* combine both mm2 and mm1 results */ |
321 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
321 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
322 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
322 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
323 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
323 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
324 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
324 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
325 |
"jnz .L1013 \n\t" /* check loop termination, proceed if required */ |
325 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
326 |
"emms \n\t" /* exit MMX state */ |
326 |
"emms \n\t" /* exit MMX state */ |
327 |
"popa \n\t":"=m" (Dest) /* %0 */ |
327 |
"popa \n\t":"=m" (Dest) /* %0 */ |
328 |
:"m"(Src2), /* %1 */ |
328 |
:"m"(Src2), /* %1 */ |
329 |
"m"(Src1), /* %2 */ |
329 |
"m"(Src1), /* %2 */ |
330 |
"m"(length) /* %3 */ |
330 |
"m"(length) /* %3 */ |
Lines 386-396
Link Here
|
386 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
386 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
387 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
387 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
388 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
388 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
389 |
"pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */ |
389 |
"pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */ |
390 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
390 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
391 |
".L1014: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
391 |
"1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
392 |
"movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */ |
392 |
"movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */ |
393 |
"movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */ |
393 |
"movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */ |
394 |
"movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */ |
394 |
"movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */ |
395 |
"punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */ |
395 |
"punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */ |
396 |
"punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */ |
396 |
"punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */ |
Lines 410-420
Link Here
|
410 |
"packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */ |
410 |
"packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */ |
411 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
411 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
412 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
412 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
413 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
413 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
414 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
414 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
415 |
"jnz .L1014 \n\t" /* check loop termination, proceed if required */ |
415 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
416 |
"emms \n\t" /* exit MMX state */ |
416 |
"emms \n\t" /* exit MMX state */ |
417 |
"popa \n\t":"=m" (Dest) /* %0 */ |
417 |
"popa \n\t":"=m" (Dest) /* %0 */ |
418 |
:"m"(Src2), /* %1 */ |
418 |
:"m"(Src2), /* %1 */ |
419 |
"m"(Src1), /* %2 */ |
419 |
"m"(Src1), /* %2 */ |
420 |
"m"(length) /* %3 */ |
420 |
"m"(length) /* %3 */ |
Lines 479-495
Link Here
|
479 |
("pusha \n\t" "mov %2, %%edx \n\t" /* load Src1 address into edx */ |
479 |
("pusha \n\t" "mov %2, %%edx \n\t" /* load Src1 address into edx */ |
480 |
"mov %1, %%esi \n\t" /* load Src2 address into esi */ |
480 |
"mov %1, %%esi \n\t" /* load Src2 address into esi */ |
481 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
481 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
482 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
482 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
483 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
483 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
484 |
".L10141: \n\t" "mov (%%edx), %%al \n\t" /* load a byte from Src1 */ |
484 |
"1:mov (%%edx), %%al \n\t" /* load a byte from Src1 */ |
485 |
"mulb (%%esi) \n\t" /* mul with a byte from Src2 */ |
485 |
"mulb (%%esi) \n\t" /* mul with a byte from Src2 */ |
486 |
".L10142: \n\t" "mov %%al, (%%edi) \n\t" /* move a byte result to Dest */ |
486 |
"mov %%al, (%%edi) \n\t" /* move a byte result to Dest */ |
487 |
"inc %%edx \n\t" /* increment Src1, Src2, Dest */ |
487 |
"inc %%edx \n\t" /* increment Src1, Src2, Dest */ |
488 |
"inc %%esi \n\t" /* pointer registers by one */ |
488 |
"inc %%esi \n\t" /* pointer registers by one */ |
489 |
"inc %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
489 |
"inc %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
490 |
"jnz .L10141 \n\t" /* check loop termination, proceed if required */ |
490 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
491 |
"popa \n\t":"=m" (Dest) /* %0 */ |
491 |
"popa \n\t":"=m" (Dest) /* %0 */ |
492 |
:"m"(Src2), /* %1 */ |
492 |
:"m"(Src2), /* %1 */ |
493 |
"m"(Src1), /* %2 */ |
493 |
"m"(Src1), /* %2 */ |
494 |
"m"(length) /* %3 */ |
494 |
"m"(length) /* %3 */ |
495 |
); |
495 |
); |
Lines 555-565
Link Here
|
555 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
555 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
556 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
556 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
557 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
557 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
558 |
"pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */ |
558 |
"pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */ |
559 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
559 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
560 |
".L1015: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
560 |
"1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
561 |
"movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */ |
561 |
"movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */ |
562 |
"movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */ |
562 |
"movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */ |
563 |
"movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */ |
563 |
"movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */ |
564 |
"punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */ |
564 |
"punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */ |
565 |
"punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */ |
565 |
"punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */ |
Lines 572-582
Link Here
|
572 |
"packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */ |
572 |
"packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */ |
573 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
573 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
574 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
574 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
575 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
575 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
576 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
576 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
577 |
"jnz .L1015 \n\t" /* check loop termination, proceed if required */ |
577 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
578 |
"emms \n\t" /* exit MMX state */ |
578 |
"emms \n\t" /* exit MMX state */ |
579 |
"popa \n\t":"=m" (Dest) /* %0 */ |
579 |
"popa \n\t":"=m" (Dest) /* %0 */ |
580 |
:"m"(Src2), /* %1 */ |
580 |
:"m"(Src2), /* %1 */ |
581 |
"m"(Src1), /* %2 */ |
581 |
"m"(Src1), /* %2 */ |
582 |
"m"(length) /* %3 */ |
582 |
"m"(length) /* %3 */ |
Lines 640-650
Link Here
|
640 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
640 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
641 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
641 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
642 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
642 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
643 |
"pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */ |
643 |
"pxor %%mm0, %%mm0 \n\t" /* zero mm0 register */ |
644 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
644 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
645 |
".L1016: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
645 |
"1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
646 |
"movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */ |
646 |
"movq (%%ebx), %%mm3 \n\t" /* load 8 bytes from Src2 into mm3 */ |
647 |
"movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */ |
647 |
"movq %%mm1, %%mm2 \n\t" /* copy mm1 into mm2 */ |
648 |
"movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */ |
648 |
"movq %%mm3, %%mm4 \n\t" /* copy mm3 into mm4 */ |
649 |
"punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */ |
649 |
"punpcklbw %%mm0, %%mm1 \n\t" /* unpack low bytes of Src1 into words */ |
650 |
"punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */ |
650 |
"punpckhbw %%mm0, %%mm2 \n\t" /* unpack high bytes of Src1 into words */ |
Lines 659-669
Link Here
|
659 |
"packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */ |
659 |
"packuswb %%mm2, %%mm1 \n\t" /* pack words back into bytes with saturation */ |
660 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
660 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
661 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
661 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
662 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
662 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
663 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
663 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
664 |
"jnz .L1016 \n\t" /* check loop termination, proceed if required */ |
664 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
665 |
"emms \n\t" /* exit MMX state */ |
665 |
"emms \n\t" /* exit MMX state */ |
666 |
"popa \n\t":"=m" (Dest) /* %0 */ |
666 |
"popa \n\t":"=m" (Dest) /* %0 */ |
667 |
:"m"(Src2), /* %1 */ |
667 |
:"m"(Src2), /* %1 */ |
668 |
"m"(Src1), /* %2 */ |
668 |
"m"(Src1), /* %2 */ |
669 |
"m"(length) /* %3 */ |
669 |
"m"(length) /* %3 */ |
Lines 726-742
Link Here
|
726 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
726 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
727 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
727 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
728 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
728 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
729 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
729 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
730 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
730 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
731 |
".L1017: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
731 |
"1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
732 |
"pand (%%ebx), %%mm1 \n\t" /* mm1=Src1&Src2 */ |
732 |
"pand (%%ebx), %%mm1 \n\t" /* mm1=Src1&Src2 */ |
733 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
733 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
734 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
734 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
735 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
735 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
736 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
736 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
737 |
"jnz .L1017 \n\t" /* check loop termination, proceed if required */ |
737 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
738 |
"emms \n\t" /* exit MMX state */ |
738 |
"emms \n\t" /* exit MMX state */ |
739 |
"popa \n\t":"=m" (Dest) /* %0 */ |
739 |
"popa \n\t":"=m" (Dest) /* %0 */ |
740 |
:"m"(Src2), /* %1 */ |
740 |
:"m"(Src2), /* %1 */ |
741 |
"m"(Src1), /* %2 */ |
741 |
"m"(Src1), /* %2 */ |
742 |
"m"(length) /* %3 */ |
742 |
"m"(length) /* %3 */ |
Lines 798-814
Link Here
|
798 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
798 |
"mov %1, %%ebx \n\t" /* load Src2 address into ebx */ |
799 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
799 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
800 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
800 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
801 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
801 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
802 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
802 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
803 |
".L91017: \n\t" "movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
803 |
"1: movq (%%eax), %%mm1 \n\t" /* load 8 bytes from Src1 into mm1 */ |
804 |
"por (%%ebx), %%mm1 \n\t" /* mm1=Src1|Src2 */ |
804 |
"por (%%ebx), %%mm1 \n\t" /* mm1=Src1|Src2 */ |
805 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
805 |
"movq %%mm1, (%%edi) \n\t" /* store result in Dest */ |
806 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
806 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
807 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
807 |
"add $8, %%ebx \n\t" /* register pointers by 8 */ |
808 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
808 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
809 |
"jnz .L91017 \n\t" /* check loop termination, proceed if required */ |
809 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
810 |
"emms \n\t" /* exit MMX state */ |
810 |
"emms \n\t" /* exit MMX state */ |
811 |
"popa \n\t":"=m" (Dest) /* %0 */ |
811 |
"popa \n\t":"=m" (Dest) /* %0 */ |
812 |
:"m"(Src2), /* %1 */ |
812 |
:"m"(Src2), /* %1 */ |
813 |
"m"(Src1), /* %2 */ |
813 |
"m"(Src1), /* %2 */ |
814 |
"m"(length) /* %3 */ |
814 |
"m"(length) /* %3 */ |
Lines 866-886
Link Here
|
866 |
("pusha \n\t" "mov %2, %%edx \n\t" /* load Src1 address into edx */ |
866 |
("pusha \n\t" "mov %2, %%edx \n\t" /* load Src1 address into edx */ |
867 |
"mov %1, %%esi \n\t" /* load Src2 address into esi */ |
867 |
"mov %1, %%esi \n\t" /* load Src2 address into esi */ |
868 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
868 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
869 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
869 |
"mov %3, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
870 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
870 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
871 |
".L10191: \n\t" "mov (%%esi), %%bl \n\t" /* load a byte from Src2 */ |
871 |
"1: mov (%%esi), %%bl \n\t" /* load a byte from Src2 */ |
872 |
"cmp $0, %%bl \n\t" /* check if it is zero */ |
872 |
"cmp $0, %%bl \n\t" /* check if it is zero */ |
873 |
"jnz .L10192 \n\t" "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! */ |
873 |
"jnz 2f \n\t" "movb $255, (%%edi) \n\t" /* division by zero = 255 !!! */ |
874 |
"jmp .L10193 \n\t" ".L10192: \n\t" "xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */ |
874 |
"jmp 3f \n\t" "2: \n\t" "xor %%ah, %%ah \n\t" /* prepare AX, zero AH register */ |
875 |
"mov (%%edx), %%al \n\t" /* load a byte from Src1 into AL */ |
875 |
"mov (%%edx), %%al \n\t" /* load a byte from Src1 into AL */ |
876 |
"div %%bl \n\t" /* divide AL by BL */ |
876 |
"div %%bl \n\t" /* divide AL by BL */ |
877 |
"mov %%al, (%%edi) \n\t" /* move a byte result to Dest */ |
877 |
"mov %%al, (%%edi) \n\t" /* move a byte result to Dest */ |
878 |
".L10193: \n\t" "inc %%edx \n\t" /* increment Src1, Src2, Dest */ |
878 |
"3: inc %%edx \n\t" /* increment Src1, Src2, Dest */ |
879 |
"inc %%esi \n\t" /* pointer registers by one */ |
879 |
"inc %%esi \n\t" /* pointer registers by one */ |
880 |
"inc %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
880 |
"inc %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
881 |
"jnz .L10191 \n\t" /* check loop termination, proceed if required */ |
881 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
882 |
"popa \n\t":"=m" (Dest) /* %0 */ |
882 |
"popa \n\t":"=m" (Dest) /* %0 */ |
883 |
:"m"(Src2), /* %1 */ |
883 |
:"m"(Src2), /* %1 */ |
884 |
"m"(Src1), /* %2 */ |
884 |
"m"(Src1), /* %2 */ |
885 |
"m"(length) /* %3 */ |
885 |
"m"(length) /* %3 */ |
886 |
); |
886 |
); |
Lines 937-952
Link Here
|
937 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
937 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
938 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
938 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
939 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
939 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
940 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
940 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
941 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
941 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
942 |
".L91117: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into mm0 */ |
942 |
"1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into mm0 */ |
943 |
"pxor %%mm1, %%mm0 \n\t" /* negate mm0 by xoring with mm1 */ |
943 |
"pxor %%mm1, %%mm0 \n\t" /* negate mm0 by xoring with mm1 */ |
944 |
"movq %%mm0, (%%edi) \n\t" /* store result in Dest */ |
944 |
"movq %%mm0, (%%edi) \n\t" /* store result in Dest */ |
945 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
945 |
"add $8, %%eax \n\t" /* increase Src1, Src2 and Dest */ |
946 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
946 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" /* decrease loop counter */ |
947 |
"jnz .L91117 \n\t" /* check loop termination, proceed if required */ |
947 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
948 |
"emms \n\t" /* exit MMX state */ |
948 |
"emms \n\t" /* exit MMX state */ |
949 |
"popa \n\t":"=m" (Dest) /* %0 */ |
949 |
"popa \n\t":"=m" (Dest) /* %0 */ |
950 |
:"m"(Src1), /* %1 */ |
950 |
:"m"(Src1), /* %1 */ |
951 |
"m"(length) /* %2 */ |
951 |
"m"(length) /* %2 */ |
952 |
); |
952 |
); |
Lines 1010-1027
Link Here
|
1010 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1010 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1011 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1011 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1012 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1012 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1013 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1013 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1014 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
1014 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
1015 |
".L1021: \n\t" |
1015 |
"1: \n\t" |
1016 |
"movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */ |
1016 |
"movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */ |
1017 |
"paddusb %%mm1, %%mm0 \n\t" /* MM0=Src1+C (add 8 bytes with saturation) */ |
1017 |
"paddusb %%mm1, %%mm0 \n\t" /* MM0=Src1+C (add 8 bytes with saturation) */ |
1018 |
"movq %%mm0, (%%edi) \n\t" /* store result in Dest */ |
1018 |
"movq %%mm0, (%%edi) \n\t" /* store result in Dest */ |
1019 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1019 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1020 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1020 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1021 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1021 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1022 |
"jnz .L1021 \n\t" /* check loop termination, proceed if required */ |
1022 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1023 |
"emms \n\t" /* exit MMX state */ |
1023 |
"emms \n\t" /* exit MMX state */ |
1024 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1024 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1025 |
:"m"(Src1), /* %1 */ |
1025 |
:"m"(Src1), /* %1 */ |
1026 |
"m"(length), /* %2 */ |
1026 |
"m"(length), /* %2 */ |
1027 |
"m"(C) /* %3 */ |
1027 |
"m"(C) /* %3 */ |
Lines 1089-1106
Link Here
|
1089 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1089 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1090 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1090 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1091 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1091 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1092 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1092 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1093 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
1093 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
1094 |
".L11023: \n\t" |
1094 |
"1: \n\t" |
1095 |
"movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */ |
1095 |
"movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */ |
1096 |
"paddusb %%mm1, %%mm0 \n\t" /* MM0=Src1+C (add 8 bytes with saturation) */ |
1096 |
"paddusb %%mm1, %%mm0 \n\t" /* MM0=Src1+C (add 8 bytes with saturation) */ |
1097 |
"movq %%mm0, (%%edi) \n\t" /* store result in Dest */ |
1097 |
"movq %%mm0, (%%edi) \n\t" /* store result in Dest */ |
1098 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1098 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1099 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1099 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1100 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1100 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1101 |
"jnz .L11023 \n\t" /* check loop termination, proceed if required */ |
1101 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1102 |
"emms \n\t" /* exit MMX state */ |
1102 |
"emms \n\t" /* exit MMX state */ |
1103 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1103 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1104 |
:"m"(Src1), /* %1 */ |
1104 |
:"m"(Src1), /* %1 */ |
1105 |
"m"(length), /* %2 */ |
1105 |
"m"(length), /* %2 */ |
1106 |
"m"(C), /* %3 */ |
1106 |
"m"(C), /* %3 */ |
Lines 1184-1204
Link Here
|
1184 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1184 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1185 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1185 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1186 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1186 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1187 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1187 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1188 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1188 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1189 |
".L1022: \n\t" |
1189 |
"1: \n\t" |
1190 |
"movq (%%eax), %%mm2 \n\t" /* load 8 bytes from Src1 into MM2 */ |
1190 |
"movq (%%eax), %%mm2 \n\t" /* load 8 bytes from Src1 into MM2 */ |
1191 |
"psrlw $1, %%mm2 \n\t" /* shift 4 WORDS of MM2 1 bit to the right */ |
1191 |
"psrlw $1, %%mm2 \n\t" /* shift 4 WORDS of MM2 1 bit to the right */ |
1192 |
/* "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of MM2 */ |
1192 |
/* "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of MM2 */ |
1193 |
".byte 0x0f, 0xdb, 0xd0 \n\t" |
1193 |
".byte 0x0f, 0xdb, 0xd0 \n\t" |
1194 |
"paddusb %%mm1, %%mm2 \n\t" /* MM2=SrcDest+C (add 8 bytes with saturation) */ |
1194 |
"paddusb %%mm1, %%mm2 \n\t" /* MM2=SrcDest+C (add 8 bytes with saturation) */ |
1195 |
"movq %%mm2, (%%edi) \n\t" /* store result in Dest */ |
1195 |
"movq %%mm2, (%%edi) \n\t" /* store result in Dest */ |
1196 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1196 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1197 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1197 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1198 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1198 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1199 |
"jnz .L1022 \n\t" /* check loop termination, proceed if required */ |
1199 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1200 |
"emms \n\t" /* exit MMX state */ |
1200 |
"emms \n\t" /* exit MMX state */ |
1201 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1201 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1202 |
:"m"(Src1), /* %1 */ |
1202 |
:"m"(Src1), /* %1 */ |
1203 |
"m"(length), /* %2 */ |
1203 |
"m"(length), /* %2 */ |
1204 |
"m"(C), /* %3 */ |
1204 |
"m"(C), /* %3 */ |
Lines 1273-1289
Link Here
|
1273 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1273 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1274 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1274 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1275 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1275 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1276 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1276 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1277 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1277 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1278 |
".L1023: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
1278 |
"1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
1279 |
"psubusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest-C (sub 8 bytes with saturation) */ |
1279 |
"psubusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest-C (sub 8 bytes with saturation) */ |
1280 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
1280 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
1281 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1281 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1282 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1282 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1283 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1283 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1284 |
"jnz .L1023 \n\t" /* check loop termination, proceed if required */ |
1284 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1285 |
"emms \n\t" /* exit MMX state */ |
1285 |
"emms \n\t" /* exit MMX state */ |
1286 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1286 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1287 |
:"m"(Src1), /* %1 */ |
1287 |
:"m"(Src1), /* %1 */ |
1288 |
"m"(length), /* %2 */ |
1288 |
"m"(length), /* %2 */ |
1289 |
"m"(C) /* %3 */ |
1289 |
"m"(C) /* %3 */ |
Lines 1352-1368
Link Here
|
1352 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1352 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1353 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1353 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1354 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1354 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1355 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1355 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1356 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1356 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1357 |
".L11024: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
1357 |
"1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
1358 |
"psubusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest-C (sub 8 bytes with saturation) */ |
1358 |
"psubusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest-C (sub 8 bytes with saturation) */ |
1359 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
1359 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
1360 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1360 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1361 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1361 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1362 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1362 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1363 |
"jnz .L11024 \n\t" /* check loop termination, proceed if required */ |
1363 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1364 |
"emms \n\t" /* exit MMX state */ |
1364 |
"emms \n\t" /* exit MMX state */ |
1365 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1365 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1366 |
:"m"(Src1), /* %1 */ |
1366 |
:"m"(Src1), /* %1 */ |
1367 |
"m"(length), /* %2 */ |
1367 |
"m"(length), /* %2 */ |
1368 |
"m"(C), /* %3 */ |
1368 |
"m"(C), /* %3 */ |
Lines 1435-1466
Link Here
|
1435 |
"movq (%%edx), %%mm0 \n\t" /* load Mask into mm0 */ |
1435 |
"movq (%%edx), %%mm0 \n\t" /* load Mask into mm0 */ |
1436 |
"xor %%ecx, %%ecx \n\t" /* zero ECX */ |
1436 |
"xor %%ecx, %%ecx \n\t" /* zero ECX */ |
1437 |
"mov %3, %%cl \n\t" /* load loop counter (N) into CL */ |
1437 |
"mov %3, %%cl \n\t" /* load loop counter (N) into CL */ |
1438 |
"movd %%ecx, %%mm3 \n\t" /* copy (N) into MM3 */ |
1438 |
"movd %%ecx, %%mm3 \n\t" /* copy (N) into MM3 */ |
1439 |
"pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */ |
1439 |
"pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */ |
1440 |
".L10240: \n\t" /* ** Prepare proper bit-Mask in MM1 ** */ |
1440 |
"1: \n\t" /* ** Prepare proper bit-Mask in MM1 ** */ |
1441 |
"psrlw $1, %%mm1 \n\t" /* shift 4 WORDS of MM1 1 bit to the right */ |
1441 |
"psrlw $1, %%mm1 \n\t" /* shift 4 WORDS of MM1 1 bit to the right */ |
1442 |
/* "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 */ |
1442 |
/* "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 */ |
1443 |
".byte 0x0f, 0xdb, 0xc8 \n\t" |
1443 |
".byte 0x0f, 0xdb, 0xc8 \n\t" |
1444 |
"dec %%cl \n\t" /* decrease loop counter */ |
1444 |
"dec %%cl \n\t" /* decrease loop counter */ |
1445 |
"jnz .L10240 \n\t" /* check loop termination, proceed if required */ |
1445 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1446 |
/* ** Shift all bytes of the image ** */ |
1446 |
/* ** Shift all bytes of the image ** */ |
1447 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1447 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1448 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1448 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1449 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1449 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1450 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1450 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1451 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1451 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1452 |
".L10241: \n\t" |
1452 |
"2: \n\t" |
1453 |
"movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
1453 |
"movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
1454 |
"psrlw %%mm3, %%mm0 \n\t" /* shift 4 WORDS of MM0 (N) bits to the right */ |
1454 |
"psrlw %%mm3, %%mm0 \n\t" /* shift 4 WORDS of MM0 (N) bits to the right */ |
1455 |
/* "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 */ |
1455 |
/* "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 */ |
1456 |
".byte 0x0f, 0xdb, 0xc1 \n\t" |
1456 |
".byte 0x0f, 0xdb, 0xc1 \n\t" |
1457 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
1457 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
1458 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1458 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1459 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1459 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1460 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1460 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1461 |
"jnz .L10241 \n\t" /* check loop termination, proceed if required */ |
1461 |
"jnz 2b \n\t" /* check loop termination, proceed if required */ |
1462 |
"emms \n\t" /* exit MMX state */ |
1462 |
"emms \n\t" /* exit MMX state */ |
1463 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1463 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1464 |
:"m"(Src1), /* %1 */ |
1464 |
:"m"(Src1), /* %1 */ |
1465 |
"m"(length), /* %2 */ |
1465 |
"m"(length), /* %2 */ |
1466 |
"m"(N), /* %3 */ |
1466 |
"m"(N), /* %3 */ |
Lines 1525-1541
Link Here
|
1525 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1525 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1526 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1526 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1527 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1527 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1528 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1528 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1529 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1529 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1530 |
".L13023: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
1530 |
"1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
1531 |
"psrld %3, %%mm0 \n\t" |
1531 |
"psrld %3, %%mm0 \n\t" |
1532 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
1532 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
1533 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1533 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1534 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1534 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1535 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1535 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1536 |
"jnz .L13023 \n\t" /* check loop termination, proceed if required */ |
1536 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1537 |
"emms \n\t" /* exit MMX state */ |
1537 |
"emms \n\t" /* exit MMX state */ |
1538 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1538 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1539 |
:"m"(Src1), /* %1 */ |
1539 |
:"m"(Src1), /* %1 */ |
1540 |
"m"(length), /* %2 */ |
1540 |
"m"(length), /* %2 */ |
1541 |
"m"(N) /* %3 */ |
1541 |
"m"(N) /* %3 */ |
Lines 1611-1635
Link Here
|
1611 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1611 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1612 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1612 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1613 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1613 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1614 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1614 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
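1615 |
"cmp $128, %%al \n\t" /* if (C <= 128) execute more efficient code -- NOTE(review): EAX was loaded with the Src1 address above, so AL here is the low byte of that address, not C; verify */ |
1615 |
"cmp $128, %%al \n\t" /* if (C <= 128) execute more efficient code -- NOTE(review): EAX was loaded with the Src1 address above, so AL here is the low byte of that address, not C; verify */ |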
1616 |
"jg .L10251 \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1616 |
"jg 2f \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1617 |
".L10250: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
1617 |
"1: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
1618 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
1618 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
1619 |
"punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
1619 |
"punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
1620 |
"punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
1620 |
"punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
1621 |
"pmullw %%mm1, %%mm3 \n\t" /* mul low bytes of SrcDest and MM1 */ |
1621 |
"pmullw %%mm1, %%mm3 \n\t" /* mul low bytes of SrcDest and MM1 */ |
1622 |
"pmullw %%mm1, %%mm4 \n\t" /* mul high bytes of SrcDest and MM1 */ |
1622 |
"pmullw %%mm1, %%mm4 \n\t" /* mul high bytes of SrcDest and MM1 */ |
1623 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
1623 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
1624 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
1624 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
1625 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1625 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1626 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1626 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1627 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1627 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1628 |
"jnz .L10250 \n\t" /* check loop termination, proceed if required */ |
1628 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1629 |
"jmp .L10252 \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1629 |
"jmp 3f \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1630 |
".L10251: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
1630 |
"2: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
1631 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
1631 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
1632 |
"punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
1632 |
"punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
1633 |
"punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
1633 |
"punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
1634 |
"pmullw %%mm1, %%mm3 \n\t" /* mul low bytes of SrcDest and MM1 */ |
1634 |
"pmullw %%mm1, %%mm3 \n\t" /* mul low bytes of SrcDest and MM1 */ |
1635 |
"pmullw %%mm1, %%mm4 \n\t" /* mul high bytes of SrcDest and MM1 */ |
1635 |
"pmullw %%mm1, %%mm4 \n\t" /* mul high bytes of SrcDest and MM1 */ |
Lines 1645-1656
Link Here
|
1645 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
1645 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
1646 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
1646 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
1647 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1647 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1648 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1648 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1649 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1649 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1650 |
"jnz .L10251 \n\t" /* check loop termination, proceed if required */ |
1650 |
"jnz 2b \n\t" /* check loop termination, proceed if required */ |
1651 |
".L10252: \n\t" "emms \n\t" /* exit MMX state */ |
1651 |
"3: emms \n\t" /* exit MMX state */ |
1652 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1652 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1653 |
:"m"(Src1), /* %1 */ |
1653 |
:"m"(Src1), /* %1 */ |
1654 |
"m"(length), /* %2 */ |
1654 |
"m"(length), /* %2 */ |
1655 |
"m"(C) /* %3 */ |
1655 |
"m"(C) /* %3 */ |
1656 |
); |
1656 |
); |
Lines 1725-1736
Link Here
|
1725 |
"pxor %%mm0, %%mm0 \n\t" /* zero MM0 register */ |
1725 |
"pxor %%mm0, %%mm0 \n\t" /* zero MM0 register */ |
1726 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1726 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1727 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1727 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1728 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1728 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1729 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1729 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1730 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1730 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1731 |
".L1026: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
1731 |
"1: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
1732 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
1732 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
1733 |
"punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
1733 |
"punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
1734 |
"punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
1734 |
"punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
1735 |
"psrlw %%mm7, %%mm3 \n\t" /* shift 4 WORDS of MM3 (N) bits to the right */ |
1735 |
"psrlw %%mm7, %%mm3 \n\t" /* shift 4 WORDS of MM3 (N) bits to the right */ |
1736 |
"psrlw %%mm7, %%mm4 \n\t" /* shift 4 WORDS of MM4 (N) bits to the right */ |
1736 |
"psrlw %%mm7, %%mm4 \n\t" /* shift 4 WORDS of MM4 (N) bits to the right */ |
Lines 1739-1749
Link Here
|
1739 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
1739 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
1740 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
1740 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
1741 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1741 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1742 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1742 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1743 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1743 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1744 |
"jnz .L1026 \n\t" /* check loop termination, proceed if required */ |
1744 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1745 |
"emms \n\t" /* exit MMX state */ |
1745 |
"emms \n\t" /* exit MMX state */ |
1746 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1746 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1747 |
:"m"(Src1), /* %1 */ |
1747 |
:"m"(Src1), /* %1 */ |
1748 |
"m"(length), /* %2 */ |
1748 |
"m"(length), /* %2 */ |
1749 |
"m"(N), /* %3 */ |
1749 |
"m"(N), /* %3 */ |
Lines 1814-1842
Link Here
|
1814 |
"movq (%%edx), %%mm0 \n\t" /* load Mask into mm0 */ |
1814 |
"movq (%%edx), %%mm0 \n\t" /* load Mask into mm0 */ |
1815 |
"xor %%ecx, %%ecx \n\t" /* zero ECX */ |
1815 |
"xor %%ecx, %%ecx \n\t" /* zero ECX */ |
1816 |
"mov %3, %%cl \n\t" /* load loop counter (N) into CL */ |
1816 |
"mov %3, %%cl \n\t" /* load loop counter (N) into CL */ |
1817 |
"movd %%ecx, %%mm3 \n\t" /* copy (N) into MM3 */ |
1817 |
"movd %%ecx, %%mm3 \n\t" /* copy (N) into MM3 */ |
1818 |
"pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */ |
1818 |
"pcmpeqb %%mm1, %%mm1 \n\t" /* generate all 1's in mm1 */ |
1819 |
".L10270: \n\t" /* ** Prepare proper bit-Mask in MM1 ** */ |
1819 |
"1: \n\t" /* ** Prepare proper bit-Mask in MM1 ** */ |
1820 |
"psllw $1, %%mm1 \n\t" /* shift 4 WORDS of MM1 1 bit to the left */ |
1820 |
"psllw $1, %%mm1 \n\t" /* shift 4 WORDS of MM1 1 bit to the left */ |
1821 |
/* "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 */ |
1821 |
/* "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 */ |
1822 |
".byte 0x0f, 0xdb, 0xc8 \n\t" "dec %%cl \n\t" /* decrease loop counter */ |
1822 |
".byte 0x0f, 0xdb, 0xc8 \n\t" "dec %%cl \n\t" /* decrease loop counter */ |
1823 |
"jnz .L10270 \n\t" /* check loop termination, proceed if required */ |
1823 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1824 |
/* ** Shift all bytes of the image ** */ |
1824 |
/* ** Shift all bytes of the image ** */ |
1825 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1825 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1826 |
"mov %0, %%edi \n\t" /* load SrcDest address into edi */ |
1826 |
"mov %0, %%edi \n\t" /* load SrcDest address into edi */ |
1827 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1827 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1828 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1828 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1829 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1829 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1830 |
".L10271: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */ |
1830 |
"2: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */ |
1831 |
"psllw %%mm3, %%mm0 \n\t" /* shift 4 WORDS of MM0 (N) bits to the left */ |
1831 |
"psllw %%mm3, %%mm0 \n\t" /* shift 4 WORDS of MM0 (N) bits to the left */ |
1832 |
/* "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 */ |
1832 |
/* "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 */ |
1833 |
".byte 0x0f, 0xdb, 0xc1 \n\t" "movq %%mm0, (%%edi) \n\t" /* store result in Dest */ |
1833 |
".byte 0x0f, 0xdb, 0xc1 \n\t" "movq %%mm0, (%%edi) \n\t" /* store result in Dest */ |
1834 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1834 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1835 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1835 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1836 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1836 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1837 |
"jnz .L10271 \n\t" /* check loop termination, proceed if required */ |
1837 |
"jnz 2b \n\t" /* check loop termination, proceed if required */ |
1838 |
"emms \n\t" /* exit MMX state */ |
1838 |
"emms \n\t" /* exit MMX state */ |
1839 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1839 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1840 |
:"m"(Src1), /* %1 */ |
1840 |
:"m"(Src1), /* %1 */ |
1841 |
"m"(length), /* %2 */ |
1841 |
"m"(length), /* %2 */ |
1842 |
"m"(N), /* %3 */ |
1842 |
"m"(N), /* %3 */ |
Lines 1900-1916
Link Here
|
1900 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1900 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1901 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1901 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1902 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1902 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1903 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1903 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1904 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1904 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1905 |
".L12023: \n\t" "movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
1905 |
"1: movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
1906 |
"pslld %3, %%mm0 \n\t" /* shift 2 DWORDS of MM0 (N) bits to the left */ |
1906 |
"pslld %3, %%mm0 \n\t" /* shift 2 DWORDS of MM0 (N) bits to the left */ |
1907 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
1907 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
1908 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1908 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1909 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1909 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1910 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1910 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1911 |
"jnz .L12023 \n\t" /* check loop termination, proceed if required */ |
1911 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1912 |
"emms \n\t" /* exit MMX state */ |
1912 |
"emms \n\t" /* exit MMX state */ |
1913 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1913 |
"popa \n\t":"=m" (Dest) /* %0 */ |
1914 |
:"m"(Src1), /* %1 */ |
1914 |
:"m"(Src1), /* %1 */ |
1915 |
"m"(length), /* %2 */ |
1915 |
"m"(length), /* %2 */ |
1916 |
"m"(N) /* %3 */ |
1916 |
"m"(N) /* %3 */ |
Lines 1979-2003
Link Here
|
1979 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1979 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
1980 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1980 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
1981 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1981 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
1982 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
1982 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
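1983 |
"cmp $7, %%al \n\t" /* if (N <= 7) execute more efficient code -- NOTE(review): EAX was loaded with the Src1 address above, so AL here is the low byte of that address, not N; verify */ |
1983 |
"cmp $7, %%al \n\t" /* if (N <= 7) execute more efficient code -- NOTE(review): EAX was loaded with the Src1 address above, so AL here is the low byte of that address, not N; verify */ |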
1984 |
"jg .L10281 \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1984 |
"jg 2f \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1985 |
".L10280: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
1985 |
"1: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
1986 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
1986 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
1987 |
"punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
1987 |
"punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
1988 |
"punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
1988 |
"punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
1989 |
"psllw %%mm7, %%mm3 \n\t" /* shift 4 WORDS of MM3 (N) bits to the left */ |
1989 |
"psllw %%mm7, %%mm3 \n\t" /* shift 4 WORDS of MM3 (N) bits to the left */ |
1990 |
"psllw %%mm7, %%mm4 \n\t" /* shift 4 WORDS of MM4 (N) bits to the left */ |
1990 |
"psllw %%mm7, %%mm4 \n\t" /* shift 4 WORDS of MM4 (N) bits to the left */ |
1991 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
1991 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
1992 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
1992 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
1993 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1993 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
1994 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1994 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
1995 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1995 |
"dec %%ecx \n\t" /* decrease loop counter */ |
1996 |
"jnz .L10280 \n\t" /* check loop termination, proceed if required */ |
1996 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
1997 |
"jmp .L10282 \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1997 |
"jmp 3f \n\t" ".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
1998 |
".L10281: \n\t" "movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
1998 |
"2: movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
1999 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
1999 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
2000 |
"punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
2000 |
"punpcklbw %%mm0, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
2001 |
"punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
2001 |
"punpckhbw %%mm0, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
2002 |
"psllw %%mm7, %%mm3 \n\t" /* shift 4 WORDS of MM3 (N) bits to the left */ |
2002 |
"psllw %%mm7, %%mm3 \n\t" /* shift 4 WORDS of MM3 (N) bits to the left */ |
2003 |
"psllw %%mm7, %%mm4 \n\t" /* shift 4 WORDS of MM4 (N) bits to the left */ |
2003 |
"psllw %%mm7, %%mm4 \n\t" /* shift 4 WORDS of MM4 (N) bits to the left */ |
Lines 2013-2024
Link Here
|
2013 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
2013 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
2014 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
2014 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
2015 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
2015 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
2016 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
2016 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
2017 |
"dec %%ecx \n\t" /* decrease loop counter */ |
2017 |
"dec %%ecx \n\t" /* decrease loop counter */ |
2018 |
"jnz .L10281 \n\t" /* check loop termination, proceed if required */ |
2018 |
"jnz 2b \n\t" /* check loop termination, proceed if required */ |
2019 |
".L10282: \n\t" "emms \n\t" /* exit MMX state */ |
2019 |
"3: emms \n\t" /* exit MMX state */ |
2020 |
"popa \n\t":"=m" (Dest) /* %0 */ |
2020 |
"popa \n\t":"=m" (Dest) /* %0 */ |
2021 |
:"m"(Src1), /* %1 */ |
2021 |
:"m"(Src1), /* %1 */ |
2022 |
"m"(length), /* %2 */ |
2022 |
"m"(length), /* %2 */ |
2023 |
"m"(N) /* %3 */ |
2023 |
"m"(N) /* %3 */ |
2024 |
); |
2024 |
); |
Lines 2093-2111
Link Here
|
2093 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
2093 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
2094 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
2094 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
2095 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
2095 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
2096 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
2096 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
2097 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
2097 |
".align 16 \n\t" /* 16 byte alignment of the loop entry */ |
2098 |
".L1029: \n\t" |
2098 |
"1: \n\t" |
2099 |
"movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
2099 |
"movq (%%eax), %%mm0 \n\t" /* load 8 bytes from SrcDest into MM0 */ |
2100 |
"paddusb %%mm2, %%mm0 \n\t" /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */ |
2100 |
"paddusb %%mm2, %%mm0 \n\t" /* MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) */ |
2101 |
"pcmpeqb %%mm1, %%mm0 \n\t" /* binarize 255:0, comparing to 255 */ |
2101 |
"pcmpeqb %%mm1, %%mm0 \n\t" /* binarize 255:0, comparing to 255 */ |
2102 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
2102 |
"movq %%mm0, (%%edi) \n\t" /* store result in SrcDest */ |
2103 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
2103 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
2104 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
2104 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
2105 |
"dec %%ecx \n\t" /* decrease loop counter */ |
2105 |
"dec %%ecx \n\t" /* decrease loop counter */ |
2106 |
"jnz .L1029 \n\t" /* check loop termination, proceed if required */ |
2106 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
2107 |
"emms \n\t" /* exit MMX state */ |
2107 |
"emms \n\t" /* exit MMX state */ |
2108 |
"popa \n\t":"=m" (Dest) /* %0 */ |
2108 |
"popa \n\t":"=m" (Dest) /* %0 */ |
2109 |
:"m"(Src1), /* %1 */ |
2109 |
:"m"(Src1), /* %1 */ |
2110 |
"m"(length), /* %2 */ |
2110 |
"m"(length), /* %2 */ |
2111 |
"m"(T) /* %3 */ |
2111 |
"m"(T) /* %3 */ |
Lines 2184-2203
Link Here
|
2184 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
2184 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
2185 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
2185 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
2186 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
2186 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
2187 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
2187 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
2188 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
2188 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
2189 |
".L1030: \n\t" |
2189 |
"1: \n\t" |
2190 |
"movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */ |
2190 |
"movq (%%eax), %%mm0 \n\t" /* load 8 bytes from Src1 into MM0 */ |
2191 |
"paddusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest+(0xFF-Tmax) */ |
2191 |
"paddusb %%mm1, %%mm0 \n\t" /* MM0=SrcDest+(0xFF-Tmax) */ |
2192 |
"psubusb %%mm7, %%mm0 \n\t" /* MM0=MM0-(0xFF-Tmax+Tmin) */ |
2192 |
"psubusb %%mm7, %%mm0 \n\t" /* MM0=MM0-(0xFF-Tmax+Tmin) */ |
2193 |
"paddusb %%mm5, %%mm0 \n\t" /* MM0=MM0+Tmin */ |
2193 |
"paddusb %%mm5, %%mm0 \n\t" /* MM0=MM0+Tmin */ |
2194 |
"movq %%mm0, (%%edi) \n\t" /* store result in Dest */ |
2194 |
"movq %%mm0, (%%edi) \n\t" /* store result in Dest */ |
2195 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
2195 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
2196 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
2196 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
2197 |
"dec %%ecx \n\t" /* decrease loop counter */ |
2197 |
"dec %%ecx \n\t" /* decrease loop counter */ |
2198 |
"jnz .L1030 \n\t" /* check loop termination, proceed if required */ |
2198 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
2199 |
"emms \n\t" /* exit MMX state */ |
2199 |
"emms \n\t" /* exit MMX state */ |
2200 |
"popa \n\t":"=m" (Dest) /* %0 */ |
2200 |
"popa \n\t":"=m" (Dest) /* %0 */ |
2201 |
:"m"(Src1), /* %1 */ |
2201 |
:"m"(Src1), /* %1 */ |
2202 |
"m"(length), /* %2 */ |
2202 |
"m"(length), /* %2 */ |
2203 |
"m"(Tmin), /* %3 */ |
2203 |
"m"(Tmin), /* %3 */ |
Lines 2261-2275
Link Here
|
2261 |
asm volatile |
2261 |
asm volatile |
2262 |
("pusha \n\t" "mov %6, %%ax \n\t" /* load Nmax in AX */ |
2262 |
("pusha \n\t" "mov %6, %%ax \n\t" /* load Nmax in AX */ |
2263 |
"mov %4, %%bx \n\t" /* load Cmax in BX */ |
2263 |
"mov %4, %%bx \n\t" /* load Cmax in BX */ |
2264 |
"sub %5, %%ax \n\t" /* AX = Nmax - Nmin */ |
2264 |
"sub %5, %%ax \n\t" /* AX = Nmax - Nmin */ |
2265 |
"sub %3, %%bx \n\t" /* BX = Cmax - Cmin */ |
2265 |
"sub %3, %%bx \n\t" /* BX = Cmax - Cmin */ |
2266 |
"jz .L10311 \n\t" /* check division by zero */ |
2266 |
"jz 1f \n\t" /* check division by zero */ |
2267 |
"xor %%dx, %%dx \n\t" /* prepare for division, zero DX */ |
2267 |
"xor %%dx, %%dx \n\t" /* prepare for division, zero DX */ |
2268 |
"div %%bx \n\t" /* AX = AX/BX */ |
2268 |
"div %%bx \n\t" /* AX = AX/BX */ |
2269 |
"jmp .L10312 \n\t" ".L10311: \n\t" "mov $255, %%ax \n\t" /* if div by zero, assume result max. byte value */ |
2269 |
"jmp 2f \n\t" "1: \n\t" "mov $255, %%ax \n\t" /* if div by zero, assume result max. byte value */ |
2270 |
".L10312: \n\t" /* ** Duplicate AX in 4 words of MM0 ** */ |
2270 |
"2: \n\t" /* ** Duplicate AX in 4 words of MM0 ** */ |
2271 |
"mov %%ax, %%bx \n\t" /* copy AX into BX */ |
2271 |
"mov %%ax, %%bx \n\t" /* copy AX into BX */ |
2272 |
"shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */ |
2272 |
"shl $16, %%eax \n\t" /* shift 2 bytes of EAX left */ |
2273 |
"mov %%bx, %%ax \n\t" /* copy BX into AX */ |
2273 |
"mov %%bx, %%ax \n\t" /* copy BX into AX */ |
2274 |
"movd %%eax, %%mm0 \n\t" /* copy EAX into MM0 */ |
2274 |
"movd %%eax, %%mm0 \n\t" /* copy EAX into MM0 */ |
2275 |
"movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */ |
2275 |
"movd %%eax, %%mm1 \n\t" /* copy EAX into MM1 */ |
Lines 2294-2304
Link Here
|
2294 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
2294 |
"mov %1, %%eax \n\t" /* load Src1 address into eax */ |
2295 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
2295 |
"mov %0, %%edi \n\t" /* load Dest address into edi */ |
2296 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
2296 |
"mov %2, %%ecx \n\t" /* load loop counter (SIZE) into ecx */ |
2297 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
2297 |
"shr $3, %%ecx \n\t" /* counter/8 (MMX loads 8 bytes at a time) */ |
2298 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
2298 |
".align 16 \n\t" /* 16 byte allignment of the loop entry */ |
2299 |
".L1031: \n\t" |
2299 |
"1: \n\t" |
2300 |
"movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
2300 |
"movq (%%eax), %%mm3 \n\t" /* load 8 bytes from Src1 into MM3 */ |
2301 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
2301 |
"movq %%mm3, %%mm4 \n\t" /* copy MM3 into MM4 */ |
2302 |
"punpcklbw %%mm7, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
2302 |
"punpcklbw %%mm7, %%mm3 \n\t" /* unpack low bytes of SrcDest into words */ |
2303 |
"punpckhbw %%mm7, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
2303 |
"punpckhbw %%mm7, %%mm4 \n\t" /* unpack high bytes of SrcDest into words */ |
2304 |
"psubusb %%mm1, %%mm3 \n\t" /* S-Cmin, low bytes */ |
2304 |
"psubusb %%mm1, %%mm3 \n\t" /* S-Cmin, low bytes */ |
Lines 2319-2329
Link Here
|
2319 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
2319 |
"packuswb %%mm4, %%mm3 \n\t" /* pack words back into bytes with saturation */ |
2320 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
2320 |
"movq %%mm3, (%%edi) \n\t" /* store result in Dest */ |
2321 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
2321 |
"add $8, %%eax \n\t" /* increase Src1 register pointer by 8 */ |
2322 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
2322 |
"add $8, %%edi \n\t" /* increase Dest register pointer by 8 */ |
2323 |
"dec %%ecx \n\t" /* decrease loop counter */ |
2323 |
"dec %%ecx \n\t" /* decrease loop counter */ |
2324 |
"jnz .L1031 \n\t" /* check loop termination, proceed if required */ |
2324 |
"jnz 1b \n\t" /* check loop termination, proceed if required */ |
2325 |
"emms \n\t" /* exit MMX state */ |
2325 |
"emms \n\t" /* exit MMX state */ |
2326 |
"popa \n\t":"=m" (Dest) /* %0 */ |
2326 |
"popa \n\t":"=m" (Dest) /* %0 */ |
2327 |
:"m"(Src1), /* %1 */ |
2327 |
:"m"(Src1), /* %1 */ |
2328 |
"m"(length), /* %2 */ |
2328 |
"m"(length), /* %2 */ |
2329 |
"m"(Cmin), /* %3 */ |
2329 |
"m"(Cmin), /* %3 */ |