Lines 81-93
Link Here
|
81 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
81 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
82 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
82 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
83 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
83 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
84 |
".L1010: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
84 |
"1: movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
85 |
"paddusb (%%ebx), %%mm1 \n\t" // mm1=Src1+Src2 (add 8 bytes with saturation) |
85 |
"paddusb (%%ebx), %%mm1 \n\t" // mm1=Src1+Src2 (add 8 bytes with saturation) |
86 |
"movq %%mm1, (%%edi) \n\t" // store result in Dest |
86 |
"movq %%mm1, (%%edi) \n\t" // store result in Dest |
87 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
87 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
88 |
"add $8, %%ebx \n\t" // register pointers by 8 |
88 |
"add $8, %%ebx \n\t" // register pointers by 8 |
89 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
89 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
90 |
"jnz .L1010 \n\t" // check loop termination, proceed if required |
90 |
"jnz 1b \n\t" // check loop termination, proceed if required |
91 |
"emms \n\t" // exit MMX state |
91 |
"emms \n\t" // exit MMX state |
92 |
"popa \n\t":"=m" (Dest) // %0 |
92 |
"popa \n\t":"=m" (Dest) // %0 |
93 |
:"m"(Src2), // %1 |
93 |
:"m"(Src2), // %1 |
Lines 158-164
Link Here
|
158 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
158 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
159 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
159 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
160 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
160 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
161 |
".L21011: \n\t" |
161 |
"1: \n\t" |
162 |
"movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
162 |
"movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
163 |
"movq (%%ebx), %%mm2 \n\t" // load 8 bytes from Src2 into mm2 |
163 |
"movq (%%ebx), %%mm2 \n\t" // load 8 bytes from Src2 into mm2 |
164 |
// --- Byte shift via Word shift --- |
164 |
// --- Byte shift via Word shift --- |
Lines 174-180
Link Here
|
174 |
"add $8, %%ebx \n\t" // register pointers by 8 |
174 |
"add $8, %%ebx \n\t" // register pointers by 8 |
175 |
"add $8, %%edi \n\t" |
175 |
"add $8, %%edi \n\t" |
176 |
"dec %%ecx \n\t" // decrease loop counter |
176 |
"dec %%ecx \n\t" // decrease loop counter |
177 |
"jnz .L21011 \n\t" // check loop termination, proceed if required |
177 |
"jnz 1b \n\t" // check loop termination, proceed if required |
178 |
"emms \n\t" // exit MMX state |
178 |
"emms \n\t" // exit MMX state |
179 |
"popa \n\t":"=m" (Dest) // %0 |
179 |
"popa \n\t":"=m" (Dest) // %0 |
180 |
:"m"(Src2), // %1 |
180 |
:"m"(Src2), // %1 |
Lines 241-253
Link Here
|
241 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
241 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
242 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
242 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
243 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
243 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
244 |
".L1012: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
244 |
"1: movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
245 |
"psubusb (%%ebx), %%mm1 \n\t" // mm1=Src1-Src2 (sub 8 bytes with saturation) |
245 |
"psubusb (%%ebx), %%mm1 \n\t" // mm1=Src1-Src2 (sub 8 bytes with saturation) |
246 |
"movq %%mm1, (%%edi) \n\t" // store result in Dest |
246 |
"movq %%mm1, (%%edi) \n\t" // store result in Dest |
247 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
247 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
248 |
"add $8, %%ebx \n\t" // register pointers by 8 |
248 |
"add $8, %%ebx \n\t" // register pointers by 8 |
249 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
249 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
250 |
"jnz .L1012 \n\t" // check loop termination, proceed if required |
250 |
"jnz 1b \n\t" // check loop termination, proceed if required |
251 |
"emms \n\t" // exit MMX state |
251 |
"emms \n\t" // exit MMX state |
252 |
"popa \n\t":"=m" (Dest) // %0 |
252 |
"popa \n\t":"=m" (Dest) // %0 |
253 |
:"m"(Src2), // %1 |
253 |
:"m"(Src2), // %1 |
Lines 313-319
Link Here
|
313 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
313 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
314 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
314 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
315 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
315 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
316 |
".L1013: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
316 |
"1: movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
317 |
"movq (%%ebx), %%mm2 \n\t" // load 8 bytes from Src2 into mm2 |
317 |
"movq (%%ebx), %%mm2 \n\t" // load 8 bytes from Src2 into mm2 |
318 |
"psubusb (%%ebx), %%mm1 \n\t" // mm1=Src1-Src2 (sub 8 bytes with saturation) |
318 |
"psubusb (%%ebx), %%mm1 \n\t" // mm1=Src1-Src2 (sub 8 bytes with saturation) |
319 |
"psubusb (%%eax), %%mm2 \n\t" // mm2=Src2-Src1 (sub 8 bytes with saturation) |
319 |
"psubusb (%%eax), %%mm2 \n\t" // mm2=Src2-Src1 (sub 8 bytes with saturation) |
Lines 322-328
Link Here
|
322 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
322 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
323 |
"add $8, %%ebx \n\t" // register pointers by 8 |
323 |
"add $8, %%ebx \n\t" // register pointers by 8 |
324 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
324 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
325 |
"jnz .L1013 \n\t" // check loop termination, proceed if required |
325 |
"jnz 1b \n\t" // check loop termination, proceed if required |
326 |
"emms \n\t" // exit MMX state |
326 |
"emms \n\t" // exit MMX state |
327 |
"popa \n\t":"=m" (Dest) // %0 |
327 |
"popa \n\t":"=m" (Dest) // %0 |
328 |
:"m"(Src2), // %1 |
328 |
:"m"(Src2), // %1 |
Lines 388-394
Link Here
|
388 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
388 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
389 |
"pxor %%mm0, %%mm0 \n\t" // zero mm0 register |
389 |
"pxor %%mm0, %%mm0 \n\t" // zero mm0 register |
390 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
390 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
391 |
".L1014: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
391 |
"1: movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
392 |
"movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3 |
392 |
"movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3 |
393 |
"movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2 |
393 |
"movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2 |
394 |
"movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4 |
394 |
"movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4 |
Lines 412-418
Link Here
|
412 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
412 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
413 |
"add $8, %%ebx \n\t" // register pointers by 8 |
413 |
"add $8, %%ebx \n\t" // register pointers by 8 |
414 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
414 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
415 |
"jnz .L1014 \n\t" // check loop termination, proceed if required |
415 |
"jnz 1b \n\t" // check loop termination, proceed if required |
416 |
"emms \n\t" // exit MMX state |
416 |
"emms \n\t" // exit MMX state |
417 |
"popa \n\t":"=m" (Dest) // %0 |
417 |
"popa \n\t":"=m" (Dest) // %0 |
418 |
:"m"(Src2), // %1 |
418 |
:"m"(Src2), // %1 |
Lines 481-493
Link Here
|
481 |
"mov %0, %%edi \n\t" // load Dest address into edi |
481 |
"mov %0, %%edi \n\t" // load Dest address into edi |
482 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
482 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
483 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
483 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
484 |
".L10141: \n\t" "mov (%%edx), %%al \n\t" // load a byte from Src1 |
484 |
"1:mov (%%edx), %%al \n\t" // load a byte from Src1 |
485 |
"mulb (%%esi) \n\t" // mul with a byte from Src2 |
485 |
"mulb (%%esi) \n\t" // mul with a byte from Src2 |
486 |
".L10142: \n\t" "mov %%al, (%%edi) \n\t" // move a byte result to Dest |
486 |
"mov %%al, (%%edi) \n\t" // move a byte result to Dest |
487 |
"inc %%edx \n\t" // increment Src1, Src2, Dest |
487 |
"inc %%edx \n\t" // increment Src1, Src2, Dest |
488 |
"inc %%esi \n\t" // pointer registers by one |
488 |
"inc %%esi \n\t" // pointer registers by one |
489 |
"inc %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
489 |
"inc %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
490 |
"jnz .L10141 \n\t" // check loop termination, proceed if required |
490 |
"jnz 1b \n\t" // check loop termination, proceed if required |
491 |
"popa \n\t":"=m" (Dest) // %0 |
491 |
"popa \n\t":"=m" (Dest) // %0 |
492 |
:"m"(Src2), // %1 |
492 |
:"m"(Src2), // %1 |
493 |
"m"(Src1), // %2 |
493 |
"m"(Src1), // %2 |
Lines 549-555
Link Here
|
549 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
549 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
550 |
"pxor %%mm0, %%mm0 \n\t" // zero mm0 register |
550 |
"pxor %%mm0, %%mm0 \n\t" // zero mm0 register |
551 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
551 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
552 |
".L1015: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
552 |
"1: movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
553 |
"movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3 |
553 |
"movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3 |
554 |
"movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2 |
554 |
"movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2 |
555 |
"movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4 |
555 |
"movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4 |
Lines 566-572
Link Here
|
566 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
566 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
567 |
"add $8, %%ebx \n\t" // register pointers by 8 |
567 |
"add $8, %%ebx \n\t" // register pointers by 8 |
568 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
568 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
569 |
"jnz .L1015 \n\t" // check loop termination, proceed if required |
569 |
"jnz 1b \n\t" // check loop termination, proceed if required |
570 |
"emms \n\t" // exit MMX state |
570 |
"emms \n\t" // exit MMX state |
571 |
"popa \n\t":"=m" (Dest) // %0 |
571 |
"popa \n\t":"=m" (Dest) // %0 |
572 |
:"m"(Src2), // %1 |
572 |
:"m"(Src2), // %1 |
Lines 634-640
Link Here
|
634 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
634 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
635 |
"pxor %%mm0, %%mm0 \n\t" // zero mm0 register |
635 |
"pxor %%mm0, %%mm0 \n\t" // zero mm0 register |
636 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
636 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
637 |
".L1016: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
637 |
"1: movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
638 |
"movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3 |
638 |
"movq (%%ebx), %%mm3 \n\t" // load 8 bytes from Src2 into mm3 |
639 |
"movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2 |
639 |
"movq %%mm1, %%mm2 \n\t" // copy mm1 into mm2 |
640 |
"movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4 |
640 |
"movq %%mm3, %%mm4 \n\t" // copy mm3 into mm4 |
Lines 653-659
Link Here
|
653 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
653 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
654 |
"add $8, %%ebx \n\t" // register pointers by 8 |
654 |
"add $8, %%ebx \n\t" // register pointers by 8 |
655 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
655 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
656 |
"jnz .L1016 \n\t" // check loop termination, proceed if required |
656 |
"jnz 1b \n\t" // check loop termination, proceed if required |
657 |
"emms \n\t" // exit MMX state |
657 |
"emms \n\t" // exit MMX state |
658 |
"popa \n\t":"=m" (Dest) // %0 |
658 |
"popa \n\t":"=m" (Dest) // %0 |
659 |
:"m"(Src2), // %1 |
659 |
:"m"(Src2), // %1 |
Lines 720-732
Link Here
|
720 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
720 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
721 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
721 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
722 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
722 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
723 |
".L1017: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
723 |
"1: movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
724 |
"pand (%%ebx), %%mm1 \n\t" // mm1=Src1&Src2 |
724 |
"pand (%%ebx), %%mm1 \n\t" // mm1=Src1&Src2 |
725 |
"movq %%mm1, (%%edi) \n\t" // store result in Dest |
725 |
"movq %%mm1, (%%edi) \n\t" // store result in Dest |
726 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
726 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
727 |
"add $8, %%ebx \n\t" // register pointers by 8 |
727 |
"add $8, %%ebx \n\t" // register pointers by 8 |
728 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
728 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
729 |
"jnz .L1017 \n\t" // check loop termination, proceed if required |
729 |
"jnz 1b \n\t" // check loop termination, proceed if required |
730 |
"emms \n\t" // exit MMX state |
730 |
"emms \n\t" // exit MMX state |
731 |
"popa \n\t":"=m" (Dest) // %0 |
731 |
"popa \n\t":"=m" (Dest) // %0 |
732 |
:"m"(Src2), // %1 |
732 |
:"m"(Src2), // %1 |
Lines 792-804
Link Here
|
792 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
792 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
793 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
793 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
794 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
794 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
795 |
".L91017: \n\t" "movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
795 |
"1: movq (%%eax), %%mm1 \n\t" // load 8 bytes from Src1 into mm1 |
796 |
"por (%%ebx), %%mm1 \n\t" // mm1=Src1|Src2 |
796 |
"por (%%ebx), %%mm1 \n\t" // mm1=Src1|Src2 |
797 |
"movq %%mm1, (%%edi) \n\t" // store result in Dest |
797 |
"movq %%mm1, (%%edi) \n\t" // store result in Dest |
798 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
798 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
799 |
"add $8, %%ebx \n\t" // register pointers by 8 |
799 |
"add $8, %%ebx \n\t" // register pointers by 8 |
800 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
800 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
801 |
"jnz .L91017 \n\t" // check loop termination, proceed if required |
801 |
"jnz 1b \n\t" // check loop termination, proceed if required |
802 |
"emms \n\t" // exit MMX state |
802 |
"emms \n\t" // exit MMX state |
803 |
"popa \n\t":"=m" (Dest) // %0 |
803 |
"popa \n\t":"=m" (Dest) // %0 |
804 |
:"m"(Src2), // %1 |
804 |
:"m"(Src2), // %1 |
Lines 860-876
Link Here
|
860 |
"mov %0, %%edi \n\t" // load Dest address into edi |
860 |
"mov %0, %%edi \n\t" // load Dest address into edi |
861 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
861 |
"mov %3, %%ecx \n\t" // load loop counter (SIZE) into ecx |
862 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
862 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
863 |
".L10191: \n\t" "mov (%%esi), %%bl \n\t" // load a byte from Src2 |
863 |
"1: mov (%%esi), %%bl \n\t" // load a byte from Src2 |
864 |
"cmp $0, %%bl \n\t" // check if it zero |
864 |
"cmp $0, %%bl \n\t" // check if it zero |
865 |
"jnz .L10192 \n\t" "movb $255, (%%edi) \n\t" // division by zero = 255 !!! |
865 |
"jnz 2f \n\t" "movb $255, (%%edi) \n\t" // division by zero = 255 !!! |
866 |
"jmp .L10193 \n\t" ".L10192: \n\t" "xor %%ah, %%ah \n\t" // prepare AX, zero AH register |
866 |
"jmp 3f \n\t" "2: \n\t" "xor %%ah, %%ah \n\t" // prepare AX, zero AH register |
867 |
"mov (%%edx), %%al \n\t" // load a byte from Src1 into AL |
867 |
"mov (%%edx), %%al \n\t" // load a byte from Src1 into AL |
868 |
"div %%bl \n\t" // divide AL by BL |
868 |
"div %%bl \n\t" // divide AL by BL |
869 |
"mov %%al, (%%edi) \n\t" // move a byte result to Dest |
869 |
"mov %%al, (%%edi) \n\t" // move a byte result to Dest |
870 |
".L10193: \n\t" "inc %%edx \n\t" // increment Src1, Src2, Dest |
870 |
"3: inc %%edx \n\t" // increment Src1, Src2, Dest |
871 |
"inc %%esi \n\t" // pointer registers by one |
871 |
"inc %%esi \n\t" // pointer registers by one |
872 |
"inc %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
872 |
"inc %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
873 |
"jnz .L10191 \n\t" // check loop termination, proceed if required |
873 |
"jnz 1b \n\t" // check loop termination, proceed if required |
874 |
"popa \n\t":"=m" (Dest) // %0 |
874 |
"popa \n\t":"=m" (Dest) // %0 |
875 |
:"m"(Src2), // %1 |
875 |
:"m"(Src2), // %1 |
876 |
"m"(Src1), // %2 |
876 |
"m"(Src1), // %2 |
Lines 907-918
Link Here
|
907 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
907 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
908 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
908 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
909 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
909 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
910 |
".L91117: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into mm1 |
910 |
"1: movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into mm1 |
911 |
"pxor %%mm1, %%mm0 \n\t" // negate mm0 by xoring with mm1 |
911 |
"pxor %%mm1, %%mm0 \n\t" // negate mm0 by xoring with mm1 |
912 |
"movq %%mm0, (%%edi) \n\t" // store result in Dest |
912 |
"movq %%mm0, (%%edi) \n\t" // store result in Dest |
913 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
913 |
"add $8, %%eax \n\t" // increase Src1, Src2 and Dest |
914 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
914 |
"add $8, %%edi \n\t" "dec %%ecx \n\t" // decrease loop counter |
915 |
"jnz .L91117 \n\t" // check loop termination, proceed if required |
915 |
"jnz 1b \n\t" // check loop termination, proceed if required |
916 |
"emms \n\t" // exit MMX state |
916 |
"emms \n\t" // exit MMX state |
917 |
"popa \n\t":"=m" (Dest) // %0 |
917 |
"popa \n\t":"=m" (Dest) // %0 |
918 |
:"m"(Src1), // %1 |
918 |
:"m"(Src1), // %1 |
Lines 980-993
Link Here
|
980 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
980 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
981 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
981 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
982 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
982 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
983 |
".L1021: \n\t" |
983 |
"1: \n\t" |
984 |
"movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0 |
984 |
"movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0 |
985 |
"paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation) |
985 |
"paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation) |
986 |
"movq %%mm0, (%%edi) \n\t" // store result in Dest |
986 |
"movq %%mm0, (%%edi) \n\t" // store result in Dest |
987 |
"add $8, %%eax \n\t" // increase Dest register pointer by 8 |
987 |
"add $8, %%eax \n\t" // increase Dest register pointer by 8 |
988 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
988 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
989 |
"dec %%ecx \n\t" // decrease loop counter |
989 |
"dec %%ecx \n\t" // decrease loop counter |
990 |
"jnz .L1021 \n\t" // check loop termination, proceed if required |
990 |
"jnz 1b \n\t" // check loop termination, proceed if required |
991 |
"emms \n\t" // exit MMX state |
991 |
"emms \n\t" // exit MMX state |
992 |
"popa \n\t":"=m" (Dest) // %0 |
992 |
"popa \n\t":"=m" (Dest) // %0 |
993 |
:"m"(Src1), // %1 |
993 |
:"m"(Src1), // %1 |
Lines 1059-1072
Link Here
|
1059 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1059 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1060 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1060 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1061 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1061 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1062 |
".L11023: \n\t" |
1062 |
"1: \n\t" |
1063 |
"movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1063 |
"movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1064 |
"paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation) |
1064 |
"paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation) |
1065 |
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest |
1065 |
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest |
1066 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1066 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1067 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1067 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1068 |
"dec %%ecx \n\t" // decrease loop counter |
1068 |
"dec %%ecx \n\t" // decrease loop counter |
1069 |
"jnz .L11023 \n\t" // check loop termination, proceed if required |
1069 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1070 |
"emms \n\t" // exit MMX state |
1070 |
"emms \n\t" // exit MMX state |
1071 |
"popa \n\t":"=m" (Dest) // %0 |
1071 |
"popa \n\t":"=m" (Dest) // %0 |
1072 |
:"m"(Src1), // %1 |
1072 |
:"m"(Src1), // %1 |
Lines 1154-1160
Link Here
|
1154 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1154 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1155 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1155 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1156 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1156 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1157 |
".L1022: \n\t" |
1157 |
"1: \n\t" |
1158 |
"movq (%%eax), %%mm2 \n\t" // load 8 bytes from Src1 into MM2 |
1158 |
"movq (%%eax), %%mm2 \n\t" // load 8 bytes from Src1 into MM2 |
1159 |
"psrlw $1, %%mm2 \n\t" // shift 4 WORDS of MM2 1 bit to the right |
1159 |
"psrlw $1, %%mm2 \n\t" // shift 4 WORDS of MM2 1 bit to the right |
1160 |
// "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of MM2 |
1160 |
// "pand %%mm0, %%mm2 \n\t" // apply Mask to 8 BYTES of MM2 |
Lines 1164-1170
Link Here
|
1164 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1164 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1165 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1165 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1166 |
"dec %%ecx \n\t" // decrease loop counter |
1166 |
"dec %%ecx \n\t" // decrease loop counter |
1167 |
"jnz .L1022 \n\t" // check loop termination, proceed if required |
1167 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1168 |
"emms \n\t" // exit MMX state |
1168 |
"emms \n\t" // exit MMX state |
1169 |
"popa \n\t":"=m" (Dest) // %0 |
1169 |
"popa \n\t":"=m" (Dest) // %0 |
1170 |
:"m"(Src1), // %1 |
1170 |
:"m"(Src1), // %1 |
Lines 1243-1255
Link Here
|
1243 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1243 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1244 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1244 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1245 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1245 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1246 |
".L1023: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1246 |
"1: movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1247 |
"psubusb %%mm1, %%mm0 \n\t" // MM0=SrcDest-C (sub 8 bytes with saturation) |
1247 |
"psubusb %%mm1, %%mm0 \n\t" // MM0=SrcDest-C (sub 8 bytes with saturation) |
1248 |
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest |
1248 |
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest |
1249 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1249 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1250 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1250 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1251 |
"dec %%ecx \n\t" // decrease loop counter |
1251 |
"dec %%ecx \n\t" // decrease loop counter |
1252 |
"jnz .L1023 \n\t" // check loop termination, proceed if required |
1252 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1253 |
"emms \n\t" // exit MMX state |
1253 |
"emms \n\t" // exit MMX state |
1254 |
"popa \n\t":"=m" (Dest) // %0 |
1254 |
"popa \n\t":"=m" (Dest) // %0 |
1255 |
:"m"(Src1), // %1 |
1255 |
:"m"(Src1), // %1 |
Lines 1322-1334
Link Here
|
1322 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1322 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1323 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1323 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1324 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1324 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1325 |
".L11024: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1325 |
"1: movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1326 |
"psubusb %%mm1, %%mm0 \n\t" // MM0=SrcDest-C (sub 8 bytes with saturation) |
1326 |
"psubusb %%mm1, %%mm0 \n\t" // MM0=SrcDest-C (sub 8 bytes with saturation) |
1327 |
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest |
1327 |
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest |
1328 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1328 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1329 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1329 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1330 |
"dec %%ecx \n\t" // decrease loop counter |
1330 |
"dec %%ecx \n\t" // decrease loop counter |
1331 |
"jnz .L11024 \n\t" // check loop termination, proceed if required |
1331 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1332 |
"emms \n\t" // exit MMX state |
1332 |
"emms \n\t" // exit MMX state |
1333 |
"popa \n\t":"=m" (Dest) // %0 |
1333 |
"popa \n\t":"=m" (Dest) // %0 |
1334 |
:"m"(Src1), // %1 |
1334 |
:"m"(Src1), // %1 |
Lines 1405-1423
Link Here
|
1405 |
"mov %3, %%cl \n\t" // load loop counter (N) into CL |
1405 |
"mov %3, %%cl \n\t" // load loop counter (N) into CL |
1406 |
"movd %%ecx, %%mm3 \n\t" // copy (N) into MM3 |
1406 |
"movd %%ecx, %%mm3 \n\t" // copy (N) into MM3 |
1407 |
"pcmpeqb %%mm1, %%mm1 \n\t" // generate all 1's in mm1 |
1407 |
"pcmpeqb %%mm1, %%mm1 \n\t" // generate all 1's in mm1 |
1408 |
".L10240: \n\t" // ** Prepare proper bit-Mask in MM1 ** |
1408 |
"1: \n\t" // ** Prepare proper bit-Mask in MM1 ** |
1409 |
"psrlw $1, %%mm1 \n\t" // shift 4 WORDS of MM1 1 bit to the right |
1409 |
"psrlw $1, %%mm1 \n\t" // shift 4 WORDS of MM1 1 bit to the right |
1410 |
// "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 |
1410 |
// "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 |
1411 |
".byte 0x0f, 0xdb, 0xc8 \n\t" |
1411 |
".byte 0x0f, 0xdb, 0xc8 \n\t" |
1412 |
"dec %%cl \n\t" // decrease loop counter |
1412 |
"dec %%cl \n\t" // decrease loop counter |
1413 |
"jnz .L10240 \n\t" // check loop termination, proceed if required |
1413 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1414 |
// ** Shift all bytes of the image ** |
1414 |
// ** Shift all bytes of the image ** |
1415 |
"mov %1, %%eax \n\t" // load Src1 address into eax |
1415 |
"mov %1, %%eax \n\t" // load Src1 address into eax |
1416 |
"mov %0, %%edi \n\t" // load Dest address into edi |
1416 |
"mov %0, %%edi \n\t" // load Dest address into edi |
1417 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1417 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1418 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1418 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1419 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1419 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1420 |
".L10241: \n\t" |
1420 |
"2: \n\t" |
1421 |
"movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1421 |
"movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1422 |
"psrlw %%mm3, %%mm0 \n\t" // shift 4 WORDS of MM0 (N) bits to the right |
1422 |
"psrlw %%mm3, %%mm0 \n\t" // shift 4 WORDS of MM0 (N) bits to the right |
1423 |
// "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 |
1423 |
// "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 |
Lines 1426-1432
Link Here
|
1426 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1426 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1427 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1427 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1428 |
"dec %%ecx \n\t" // decrease loop counter |
1428 |
"dec %%ecx \n\t" // decrease loop counter |
1429 |
"jnz .L10241 \n\t" // check loop termination, proceed if required |
1429 |
"jnz 2b \n\t" // check loop termination, proceed if required |
1430 |
"emms \n\t" // exit MMX state |
1430 |
"emms \n\t" // exit MMX state |
1431 |
"popa \n\t":"=m" (Dest) // %0 |
1431 |
"popa \n\t":"=m" (Dest) // %0 |
1432 |
:"m"(Src1), // %1 |
1432 |
:"m"(Src1), // %1 |
Lines 1495-1507
Link Here
|
1495 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1495 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1496 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1496 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1497 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1497 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1498 |
".L13023: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1498 |
"1: movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1499 |
"psrld %3, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation) |
1499 |
"psrld %3, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation) |
1500 |
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest |
1500 |
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest |
1501 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1501 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1502 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1502 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1503 |
"dec %%ecx \n\t" // decrease loop counter |
1503 |
"dec %%ecx \n\t" // decrease loop counter |
1504 |
"jnz .L13023 \n\t" // check loop termination, proceed if required |
1504 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1505 |
"emms \n\t" // exit MMX state |
1505 |
"emms \n\t" // exit MMX state |
1506 |
"popa \n\t":"=m" (Dest) // %0 |
1506 |
"popa \n\t":"=m" (Dest) // %0 |
1507 |
:"m"(Src1), // %1 |
1507 |
:"m"(Src1), // %1 |
Lines 1581-1588
Link Here
|
1581 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1581 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1582 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1582 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1583 |
"cmp $128, %%al \n\t" // if (C <= 128) execute more efficient code |
1583 |
"cmp $128, %%al \n\t" // if (C <= 128) execute more efficient code |
1584 |
"jg .L10251 \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry |
1584 |
"jg 2f \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry |
1585 |
".L10250: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
1585 |
"1: movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
1586 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
1586 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
1587 |
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
1587 |
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
1588 |
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words |
1588 |
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words |
Lines 1593-1601
Link Here
|
1593 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1593 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1594 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1594 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1595 |
"dec %%ecx \n\t" // decrease loop counter |
1595 |
"dec %%ecx \n\t" // decrease loop counter |
1596 |
"jnz .L10250 \n\t" // check loop termination, proceed if required |
1596 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1597 |
"jmp .L10252 \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry |
1597 |
"jmp 3f \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry |
1598 |
".L10251: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
1598 |
"2: movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
1599 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
1599 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
1600 |
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
1600 |
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
1601 |
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words |
1601 |
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words |
Lines 1615-1622
Link Here
|
1615 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1615 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1616 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1616 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1617 |
"dec %%ecx \n\t" // decrease loop counter |
1617 |
"dec %%ecx \n\t" // decrease loop counter |
1618 |
"jnz .L10251 \n\t" // check loop termination, proceed if required |
1618 |
"jnz 2b \n\t" // check loop termination, proceed if required |
1619 |
".L10252: \n\t" "emms \n\t" // exit MMX state |
1619 |
"3: emms \n\t" // exit MMX state |
1620 |
"popa \n\t":"=m" (Dest) // %0 |
1620 |
"popa \n\t":"=m" (Dest) // %0 |
1621 |
:"m"(Src1), // %1 |
1621 |
:"m"(Src1), // %1 |
1622 |
"m"(length), // %2 |
1622 |
"m"(length), // %2 |
Lines 1695-1702
Link Here
|
1695 |
"mov %0, %%edi \n\t" // load Dest address into edi |
1695 |
"mov %0, %%edi \n\t" // load Dest address into edi |
1696 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1696 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1697 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1697 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1698 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1698 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1699 |
".L1026: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
1699 |
"1: movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
1700 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
1700 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
1701 |
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
1701 |
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
1702 |
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words |
1702 |
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words |
Lines 1709-1715
Link Here
|
1709 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1709 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1710 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1710 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1711 |
"dec %%ecx \n\t" // decrease loop counter |
1711 |
"dec %%ecx \n\t" // decrease loop counter |
1712 |
"jnz .L1026 \n\t" // check loop termination, proceed if required |
1712 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1713 |
"emms \n\t" // exit MMX state |
1713 |
"emms \n\t" // exit MMX state |
1714 |
"popa \n\t":"=m" (Dest) // %0 |
1714 |
"popa \n\t":"=m" (Dest) // %0 |
1715 |
:"m"(Src1), // %1 |
1715 |
:"m"(Src1), // %1 |
Lines 1784-1808
Link Here
|
1784 |
"mov %3, %%cl \n\t" // load loop counter (N) into CL |
1784 |
"mov %3, %%cl \n\t" // load loop counter (N) into CL |
1785 |
"movd %%ecx, %%mm3 \n\t" // copy (N) into MM3 |
1785 |
"movd %%ecx, %%mm3 \n\t" // copy (N) into MM3 |
1786 |
"pcmpeqb %%mm1, %%mm1 \n\t" // generate all 1's in mm1 |
1786 |
"pcmpeqb %%mm1, %%mm1 \n\t" // generate all 1's in mm1 |
1787 |
".L10270: \n\t" // ** Prepare proper bit-Mask in MM1 ** |
1787 |
"1: \n\t" // ** Prepare proper bit-Mask in MM1 ** |
1788 |
"psllw $1, %%mm1 \n\t" // shift 4 WORDS of MM1 1 bit to the left |
1788 |
"psllw $1, %%mm1 \n\t" // shift 4 WORDS of MM1 1 bit to the left |
1789 |
// "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 |
1789 |
// "pand %%mm0, %%mm1 \n\t" // apply Mask to 8 BYTES of MM1 |
1790 |
".byte 0x0f, 0xdb, 0xc8 \n\t" "dec %%cl \n\t" // decrease loop counter |
1790 |
".byte 0x0f, 0xdb, 0xc8 \n\t" "dec %%cl \n\t" // decrease loop counter |
1791 |
"jnz .L10270 \n\t" // check loop termination, proceed if required |
1791 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1792 |
// ** Shift all bytes of the image ** |
1792 |
// ** Shift all bytes of the image ** |
1793 |
"mov %1, %%eax \n\t" // load Src1 address into eax |
1793 |
"mov %1, %%eax \n\t" // load Src1 address into eax |
1794 |
"mov %0, %%edi \n\t" // load SrcDest address into edi |
1794 |
"mov %0, %%edi \n\t" // load SrcDest address into edi |
1795 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1795 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1796 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1796 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1797 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1797 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1798 |
".L10271: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0 |
1798 |
"2: movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0 |
1799 |
"psllw %%mm3, %%mm0 \n\t" // shift 4 WORDS of MM0 (N) bits to the left |
1799 |
"psllw %%mm3, %%mm0 \n\t" // shift 4 WORDS of MM0 (N) bits to the left |
1800 |
// "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 |
1800 |
// "pand %%mm1, %%mm0 \n\t" // apply proper bit-Mask to 8 BYTES of MM0 |
1801 |
".byte 0x0f, 0xdb, 0xc1 \n\t" "movq %%mm0, (%%edi) \n\t" // store result in Dest |
1801 |
".byte 0x0f, 0xdb, 0xc1 \n\t" "movq %%mm0, (%%edi) \n\t" // store result in Dest |
1802 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1802 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1803 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1803 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1804 |
"dec %%ecx \n\t" // decrease loop counter |
1804 |
"dec %%ecx \n\t" // decrease loop counter |
1805 |
"jnz .L10271 \n\t" // check loop termination, proceed if required |
1805 |
"jnz 2b \n\t" // check loop termination, proceed if required |
1806 |
"emms \n\t" // exit MMX state |
1806 |
"emms \n\t" // exit MMX state |
1807 |
"popa \n\t":"=m" (Dest) // %0 |
1807 |
"popa \n\t":"=m" (Dest) // %0 |
1808 |
:"m"(Src1), // %1 |
1808 |
:"m"(Src1), // %1 |
Lines 1870-1882
Link Here
|
1870 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1870 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1871 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1871 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1872 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1872 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
1873 |
".L12023: \n\t" "movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1873 |
"1: movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
1874 |
"pslld %3, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation) |
1874 |
"pslld %3, %%mm0 \n\t" // MM0=SrcDest+C (add 8 bytes with saturation) |
1875 |
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest |
1875 |
"movq %%mm0, (%%edi) \n\t" // store result in SrcDest |
1876 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1876 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1877 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1877 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1878 |
"dec %%ecx \n\t" // decrease loop counter |
1878 |
"dec %%ecx \n\t" // decrease loop counter |
1879 |
"jnz .L12023 \n\t" // check loop termination, proceed if required |
1879 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1880 |
"emms \n\t" // exit MMX state |
1880 |
"emms \n\t" // exit MMX state |
1881 |
"popa \n\t":"=m" (Dest) // %0 |
1881 |
"popa \n\t":"=m" (Dest) // %0 |
1882 |
:"m"(Src1), // %1 |
1882 |
:"m"(Src1), // %1 |
Lines 1949-1956
Link Here
|
1949 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1949 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
1950 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1950 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
1951 |
"cmp $7, %%al \n\t" // if (N <= 7) execute more efficient code |
1951 |
"cmp $7, %%al \n\t" // if (N <= 7) execute more efficient code |
1952 |
"jg .L10281 \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry |
1952 |
"jg 2f \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry |
1953 |
".L10280: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
1953 |
"1: movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
1954 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
1954 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
1955 |
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
1955 |
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
1956 |
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words |
1956 |
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words |
Lines 1961-1969
Link Here
|
1961 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1961 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1962 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1962 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1963 |
"dec %%ecx \n\t" // decrease loop counter |
1963 |
"dec %%ecx \n\t" // decrease loop counter |
1964 |
"jnz .L10280 \n\t" // check loop termination, proceed if required |
1964 |
"jnz 1b \n\t" // check loop termination, proceed if required |
1965 |
"jmp .L10282 \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry |
1965 |
"jmp 3f \n\t" ".align 16 \n\t" // 16 byte allignment of the loop entry |
1966 |
".L10281: \n\t" "movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
1966 |
"2: movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
1967 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
1967 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
1968 |
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
1968 |
"punpcklbw %%mm0, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
1969 |
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words |
1969 |
"punpckhbw %%mm0, %%mm4 \n\t" // unpack high bytes of SrcDest into words |
Lines 1983-1990
Link Here
|
1983 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1983 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
1984 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1984 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
1985 |
"dec %%ecx \n\t" // decrease loop counter |
1985 |
"dec %%ecx \n\t" // decrease loop counter |
1986 |
"jnz .L10281 \n\t" // check loop termination, proceed if required |
1986 |
"jnz 2b \n\t" // check loop termination, proceed if required |
1987 |
".L10282: \n\t" "emms \n\t" // exit MMX state |
1987 |
"3: emms \n\t" // exit MMX state |
1988 |
"popa \n\t":"=m" (Dest) // %0 |
1988 |
"popa \n\t":"=m" (Dest) // %0 |
1989 |
:"m"(Src1), // %1 |
1989 |
:"m"(Src1), // %1 |
1990 |
"m"(length), // %2 |
1990 |
"m"(length), // %2 |
Lines 2063-2069
Link Here
|
2063 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
2063 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
2064 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
2064 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
2065 |
".align 16 \n\t" // 16 byte alignment of the loop entry |
2065 |
".align 16 \n\t" // 16 byte alignment of the loop entry |
2066 |
".L1029: \n\t" |
2066 |
"1: \n\t" |
2067 |
"movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
2067 |
"movq (%%eax), %%mm0 \n\t" // load 8 bytes from SrcDest into MM0 |
2068 |
"paddusb %%mm2, %%mm0 \n\t" // MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) |
2068 |
"paddusb %%mm2, %%mm0 \n\t" // MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation) |
2069 |
"pcmpeqb %%mm1, %%mm0 \n\t" // binarize 255:0, comparing to 255 |
2069 |
"pcmpeqb %%mm1, %%mm0 \n\t" // binarize 255:0, comparing to 255 |
Lines 2071-2077
Link Here
|
2071 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
2071 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
2072 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
2072 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
2073 |
"dec %%ecx \n\t" // decrease loop counter |
2073 |
"dec %%ecx \n\t" // decrease loop counter |
2074 |
"jnz .L1029 \n\t" // check loop termination, proceed if required |
2074 |
"jnz 1b \n\t" // check loop termination, proceed if required |
2075 |
"emms \n\t" // exit MMX state |
2075 |
"emms \n\t" // exit MMX state |
2076 |
"popa \n\t":"=m" (Dest) // %0 |
2076 |
"popa \n\t":"=m" (Dest) // %0 |
2077 |
:"m"(Src1), // %1 |
2077 |
:"m"(Src1), // %1 |
Lines 2154-2160
Link Here
|
2154 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
2154 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
2155 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
2155 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
2156 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
2156 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
2157 |
".L1030: \n\t" |
2157 |
"1: \n\t" |
2158 |
"movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0 |
2158 |
"movq (%%eax), %%mm0 \n\t" // load 8 bytes from Src1 into MM0 |
2159 |
"paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+(0xFF-Tmax) |
2159 |
"paddusb %%mm1, %%mm0 \n\t" // MM0=SrcDest+(0xFF-Tmax) |
2160 |
"psubusb %%mm7, %%mm0 \n\t" // MM0=MM0-(0xFF-Tmax+Tmin) |
2160 |
"psubusb %%mm7, %%mm0 \n\t" // MM0=MM0-(0xFF-Tmax+Tmin) |
Lines 2163-2169
Link Here
|
2163 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
2163 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
2164 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
2164 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
2165 |
"dec %%ecx \n\t" // decrease loop counter |
2165 |
"dec %%ecx \n\t" // decrease loop counter |
2166 |
"jnz .L1030 \n\t" // check loop termination, proceed if required |
2166 |
"jnz 1b \n\t" // check loop termination, proceed if required |
2167 |
"emms \n\t" // exit MMX state |
2167 |
"emms \n\t" // exit MMX state |
2168 |
"popa \n\t":"=m" (Dest) // %0 |
2168 |
"popa \n\t":"=m" (Dest) // %0 |
2169 |
:"m"(Src1), // %1 |
2169 |
:"m"(Src1), // %1 |
Lines 2231-2241
Link Here
|
2231 |
"mov %4, %%bx \n\t" // load Cmax in BX |
2231 |
"mov %4, %%bx \n\t" // load Cmax in BX |
2232 |
"sub %5, %%ax \n\t" // AX = Nmax - Nmin |
2232 |
"sub %5, %%ax \n\t" // AX = Nmax - Nmin |
2233 |
"sub %3, %%bx \n\t" // BX = Cmax - Cmin |
2233 |
"sub %3, %%bx \n\t" // BX = Cmax - Cmin |
2234 |
"jz .L10311 \n\t" // check division by zero |
2234 |
"jz 1f \n\t" // check division by zero |
2235 |
"xor %%dx, %%dx \n\t" // prepare for division, zero DX |
2235 |
"xor %%dx, %%dx \n\t" // prepare for division, zero DX |
2236 |
"div %%bx \n\t" // AX = AX/BX |
2236 |
"div %%bx \n\t" // AX = AX/BX |
2237 |
"jmp .L10312 \n\t" ".L10311: \n\t" "mov $255, %%ax \n\t" // if div by zero, assume result max. byte value |
2237 |
"jmp 2f \n\t" "1: \n\t" "mov $255, %%ax \n\t" // if div by zero, assume result max. byte value |
2238 |
".L10312: \n\t" // ** Duplicate AX in 4 words of MM0 ** |
2238 |
"2: \n\t" // ** Duplicate AX in 4 words of MM0 ** |
2239 |
"mov %%ax, %%bx \n\t" // copy AX into BX |
2239 |
"mov %%ax, %%bx \n\t" // copy AX into BX |
2240 |
"shl $16, %%eax \n\t" // shift 2 bytes of EAX left |
2240 |
"shl $16, %%eax \n\t" // shift 2 bytes of EAX left |
2241 |
"mov %%bx, %%ax \n\t" // copy BX into AX |
2241 |
"mov %%bx, %%ax \n\t" // copy BX into AX |
Lines 2264-2270
Link Here
|
2264 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
2264 |
"mov %2, %%ecx \n\t" // load loop counter (SIZE) into ecx |
2265 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
2265 |
"shr $3, %%ecx \n\t" // counter/8 (MMX loads 8 bytes at a time) |
2266 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
2266 |
".align 16 \n\t" // 16 byte allignment of the loop entry |
2267 |
".L1031: \n\t" |
2267 |
"1: \n\t" |
2268 |
"movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
2268 |
"movq (%%eax), %%mm3 \n\t" // load 8 bytes from Src1 into MM3 |
2269 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
2269 |
"movq %%mm3, %%mm4 \n\t" // copy MM3 into MM4 |
2270 |
"punpcklbw %%mm7, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
2270 |
"punpcklbw %%mm7, %%mm3 \n\t" // unpack low bytes of SrcDest into words |
Lines 2289-2295
Link Here
|
2289 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
2289 |
"add $8, %%eax \n\t" // increase Src1 register pointer by 8 |
2290 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
2290 |
"add $8, %%edi \n\t" // increase Dest register pointer by 8 |
2291 |
"dec %%ecx \n\t" // decrease loop counter |
2291 |
"dec %%ecx \n\t" // decrease loop counter |
2292 |
"jnz .L1031 \n\t" // check loop termination, proceed if required |
2292 |
"jnz 1b \n\t" // check loop termination, proceed if required |
2293 |
"emms \n\t" // exit MMX state |
2293 |
"emms \n\t" // exit MMX state |
2294 |
"popa \n\t":"=m" (Dest) // %0 |
2294 |
"popa \n\t":"=m" (Dest) // %0 |
2295 |
:"m"(Src1), // %1 |
2295 |
:"m"(Src1), // %1 |