Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 219621 | Differences between
and this patch

Collapse All | Expand All

(-)SDL_gfx-2.0.13.orig/SDL_imageFilter.c (-76 / +76 lines)
Lines 81-93 Link Here
81
       "mov          %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
81
       "mov          %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
82
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
82
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
83
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
83
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
84
      ".L1010:                \n\t" "movq    (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
84
      "1: movq (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
85
      "paddusb (%%ebx), %%mm1 \n\t"	// mm1=Src1+Src2 (add 8 bytes with saturation)
85
      "paddusb (%%ebx), %%mm1 \n\t"	// mm1=Src1+Src2 (add 8 bytes with saturation)
86
      "movq    %%mm1, (%%edi) \n\t"	// store result in Dest
86
      "movq    %%mm1, (%%edi) \n\t"	// store result in Dest
87
       "add          $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
87
       "add          $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
88
      "add          $8, %%ebx \n\t"	// register pointers by 8
88
      "add          $8, %%ebx \n\t"	// register pointers by 8
89
      "add          $8, %%edi \n\t" "dec              %%ecx \n\t"	// decrease loop counter
89
      "add          $8, %%edi \n\t" "dec              %%ecx \n\t"	// decrease loop counter
90
      "jnz             .L1010 \n\t"	// check loop termination, proceed if required
90
      "jnz             1b     \n\t"	// check loop termination, proceed if required
91
       "emms                   \n\t"	// exit MMX state
91
       "emms                   \n\t"	// exit MMX state
92
      "popa                   \n\t":"=m" (Dest)	// %0
92
      "popa                   \n\t":"=m" (Dest)	// %0
93
      :"m"(Src2),		// %1
93
      :"m"(Src2),		// %1
Lines 158-164 Link Here
158
       "mov          %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
158
       "mov          %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
159
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
159
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
160
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
160
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
161
      ".L21011:                \n\t" 
161
      "1:                      \n\t" 
162
      "movq    (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
162
      "movq    (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
163
      "movq    (%%ebx), %%mm2 \n\t"	// load 8 bytes from Src2 into mm2
163
      "movq    (%%ebx), %%mm2 \n\t"	// load 8 bytes from Src2 into mm2
164
      // --- Byte shift via Word shift ---
164
      // --- Byte shift via Word shift ---
Lines 174-180 Link Here
174
      "add          $8, %%ebx \n\t"	// register pointers by 8
174
      "add          $8, %%ebx \n\t"	// register pointers by 8
175
      "add          $8, %%edi \n\t" 
175
      "add          $8, %%edi \n\t" 
176
      "dec              %%ecx \n\t"	// decrease loop counter
176
      "dec              %%ecx \n\t"	// decrease loop counter
177
      "jnz             .L21011 \n\t"	// check loop termination, proceed if required
177
      "jnz                 1b \n\t"	// check loop termination, proceed if required
178
       "emms                   \n\t"	// exit MMX state
178
       "emms                   \n\t"	// exit MMX state
179
      "popa                   \n\t":"=m" (Dest)	// %0
179
      "popa                   \n\t":"=m" (Dest)	// %0
180
      :"m"(Src2),		// %1
180
      :"m"(Src2),		// %1
Lines 241-253 Link Here
241
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
241
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
242
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
242
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
243
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
243
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
244
      ".L1012:         \n\t" "movq    (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
244
      "1: movq (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
245
      "psubusb (%%ebx), %%mm1 \n\t"	// mm1=Src1-Src2 (sub 8 bytes with saturation)
245
      "psubusb (%%ebx), %%mm1 \n\t"	// mm1=Src1-Src2 (sub 8 bytes with saturation)
246
      "movq    %%mm1, (%%edi) \n\t"	// store result in Dest
246
      "movq    %%mm1, (%%edi) \n\t"	// store result in Dest
247
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
247
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
248
      "add $8, %%ebx \n\t"	// register pointers by 8
248
      "add $8, %%ebx \n\t"	// register pointers by 8
249
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
249
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
250
      "jnz .L1012    \n\t"	// check loop termination, proceed if required
250
      "jnz 1b         \n\t"	// check loop termination, proceed if required
251
       "emms          \n\t"	// exit MMX state
251
       "emms          \n\t"	// exit MMX state
252
      "popa                   \n\t":"=m" (Dest)	// %0
252
      "popa                   \n\t":"=m" (Dest)	// %0
253
      :"m"(Src2),		// %1
253
      :"m"(Src2),		// %1
Lines 313-319 Link Here
313
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
313
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
314
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
314
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
315
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
315
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
316
      ".L1013:         \n\t" "movq    (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
316
      "1: movq (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
317
      "movq    (%%ebx), %%mm2 \n\t"	// load 8 bytes from Src2 into mm2
317
      "movq    (%%ebx), %%mm2 \n\t"	// load 8 bytes from Src2 into mm2
318
      "psubusb (%%ebx), %%mm1 \n\t"	// mm1=Src1-Src2 (sub 8 bytes with saturation)
318
      "psubusb (%%ebx), %%mm1 \n\t"	// mm1=Src1-Src2 (sub 8 bytes with saturation)
319
      "psubusb (%%eax), %%mm2 \n\t"	// mm2=Src2-Src1 (sub 8 bytes with saturation)
319
      "psubusb (%%eax), %%mm2 \n\t"	// mm2=Src2-Src1 (sub 8 bytes with saturation)
Lines 322-328 Link Here
322
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
322
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
323
      "add $8, %%ebx \n\t"	// register pointers by 8
323
      "add $8, %%ebx \n\t"	// register pointers by 8
324
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
324
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
325
      "jnz .L1013    \n\t"	// check loop termination, proceed if required
325
      "jnz 1b        \n\t"	// check loop termination, proceed if required
326
       "emms          \n\t"	// exit MMX state
326
       "emms          \n\t"	// exit MMX state
327
      "popa                   \n\t":"=m" (Dest)	// %0
327
      "popa                   \n\t":"=m" (Dest)	// %0
328
      :"m"(Src2),		// %1
328
      :"m"(Src2),		// %1
Lines 388-394 Link Here
388
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
388
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
389
       "pxor      %%mm0, %%mm0 \n\t"	// zero mm0 register
389
       "pxor      %%mm0, %%mm0 \n\t"	// zero mm0 register
390
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
390
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
391
      ".L1014:         \n\t" "movq    (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
391
      "1: movq (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
392
      "movq    (%%ebx), %%mm3 \n\t"	// load 8 bytes from Src2 into mm3
392
      "movq    (%%ebx), %%mm3 \n\t"	// load 8 bytes from Src2 into mm3
393
      "movq      %%mm1, %%mm2 \n\t"	// copy mm1 into mm2
393
      "movq      %%mm1, %%mm2 \n\t"	// copy mm1 into mm2
394
      "movq      %%mm3, %%mm4 \n\t"	// copy mm3 into mm4 
394
      "movq      %%mm3, %%mm4 \n\t"	// copy mm3 into mm4 
Lines 412-418 Link Here
412
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
412
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
413
      "add $8, %%ebx \n\t"	// register pointers by 8
413
      "add $8, %%ebx \n\t"	// register pointers by 8
414
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
414
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
415
      "jnz .L1014    \n\t"	// check loop termination, proceed if required
415
      "jnz 1b        \n\t"	// check loop termination, proceed if required
416
       "emms          \n\t"	// exit MMX state
416
       "emms          \n\t"	// exit MMX state
417
      "popa \n\t":"=m" (Dest)	// %0
417
      "popa \n\t":"=m" (Dest)	// %0
418
      :"m"(Src2),		// %1
418
      :"m"(Src2),		// %1
Lines 481-493 Link Here
481
      "mov %0, %%edi \n\t"	// load Dest address into edi
481
      "mov %0, %%edi \n\t"	// load Dest address into edi
482
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
482
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
483
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
483
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
484
      ".L10141:        \n\t" "mov  (%%edx), %%al \n\t"	// load a byte from Src1
484
      "1:mov  (%%edx), %%al \n\t"	// load a byte from Src1
485
      "mulb (%%esi)       \n\t"	// mul with a byte from Src2
485
      "mulb (%%esi)       \n\t"	// mul with a byte from Src2
486
       ".L10142:           \n\t" "mov %%al, (%%edi)  \n\t"	// move a byte result to Dest
486
       "mov %%al, (%%edi)  \n\t"	// move a byte result to Dest
487
       "inc %%edx \n\t"		// increment Src1, Src2, Dest
487
       "inc %%edx \n\t"		// increment Src1, Src2, Dest
488
      "inc %%esi \n\t"		// pointer registers by one
488
      "inc %%esi \n\t"		// pointer registers by one
489
      "inc %%edi \n\t" "dec %%ecx      \n\t"	// decrease loop counter
489
      "inc %%edi \n\t" "dec %%ecx      \n\t"	// decrease loop counter
490
      "jnz .L10141    \n\t"	// check loop termination, proceed if required
490
      "jnz 1b         \n\t"	// check loop termination, proceed if required
491
       "popa                   \n\t":"=m" (Dest)	// %0
491
       "popa                   \n\t":"=m" (Dest)	// %0
492
      :"m"(Src2),		// %1
492
      :"m"(Src2),		// %1
493
      "m"(Src1),		// %2
493
      "m"(Src1),		// %2
Lines 549-555 Link Here
549
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
549
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
550
       "pxor      %%mm0, %%mm0 \n\t"	// zero mm0 register
550
       "pxor      %%mm0, %%mm0 \n\t"	// zero mm0 register
551
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
551
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
552
      ".L1015:         \n\t" "movq    (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
552
      "1: movq (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
553
      "movq    (%%ebx), %%mm3 \n\t"	// load 8 bytes from Src2 into mm3
553
      "movq    (%%ebx), %%mm3 \n\t"	// load 8 bytes from Src2 into mm3
554
      "movq      %%mm1, %%mm2 \n\t"	// copy mm1 into mm2
554
      "movq      %%mm1, %%mm2 \n\t"	// copy mm1 into mm2
555
      "movq      %%mm3, %%mm4 \n\t"	// copy mm3 into mm4 
555
      "movq      %%mm3, %%mm4 \n\t"	// copy mm3 into mm4 
Lines 566-572 Link Here
566
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
566
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
567
      "add $8, %%ebx \n\t"	// register pointers by 8
567
      "add $8, %%ebx \n\t"	// register pointers by 8
568
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
568
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
569
      "jnz .L1015    \n\t"	// check loop termination, proceed if required
569
      "jnz 1b        \n\t"	// check loop termination, proceed if required
570
       "emms          \n\t"	// exit MMX state
570
       "emms          \n\t"	// exit MMX state
571
      "popa \n\t":"=m" (Dest)	// %0
571
      "popa \n\t":"=m" (Dest)	// %0
572
      :"m"(Src2),		// %1
572
      :"m"(Src2),		// %1
Lines 634-640 Link Here
634
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
634
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
635
       "pxor      %%mm0, %%mm0 \n\t"	// zero mm0 register
635
       "pxor      %%mm0, %%mm0 \n\t"	// zero mm0 register
636
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
636
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
637
      ".L1016:         \n\t" "movq    (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
637
      "1: movq (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
638
      "movq    (%%ebx), %%mm3 \n\t"	// load 8 bytes from Src2 into mm3
638
      "movq    (%%ebx), %%mm3 \n\t"	// load 8 bytes from Src2 into mm3
639
      "movq      %%mm1, %%mm2 \n\t"	// copy mm1 into mm2
639
      "movq      %%mm1, %%mm2 \n\t"	// copy mm1 into mm2
640
      "movq      %%mm3, %%mm4 \n\t"	// copy mm3 into mm4 
640
      "movq      %%mm3, %%mm4 \n\t"	// copy mm3 into mm4 
Lines 653-659 Link Here
653
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
653
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
654
      "add $8, %%ebx \n\t"	// register pointers by 8
654
      "add $8, %%ebx \n\t"	// register pointers by 8
655
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
655
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
656
      "jnz .L1016    \n\t"	// check loop termination, proceed if required
656
      "jnz 1b        \n\t"	// check loop termination, proceed if required
657
       "emms          \n\t"	// exit MMX state
657
       "emms          \n\t"	// exit MMX state
658
      "popa                   \n\t":"=m" (Dest)	// %0
658
      "popa                   \n\t":"=m" (Dest)	// %0
659
      :"m"(Src2),		// %1
659
      :"m"(Src2),		// %1
Lines 720-732 Link Here
720
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
720
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
721
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
721
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
722
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
722
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
723
      ".L1017:         \n\t" "movq    (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
723
      "1: movq (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
724
      "pand    (%%ebx), %%mm1 \n\t"	// mm1=Src1&Src2
724
      "pand    (%%ebx), %%mm1 \n\t"	// mm1=Src1&Src2
725
      "movq    %%mm1, (%%edi) \n\t"	// store result in Dest
725
      "movq    %%mm1, (%%edi) \n\t"	// store result in Dest
726
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
726
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
727
      "add $8, %%ebx \n\t"	// register pointers by 8
727
      "add $8, %%ebx \n\t"	// register pointers by 8
728
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
728
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
729
      "jnz .L1017    \n\t"	// check loop termination, proceed if required
729
      "jnz 1b        \n\t"	// check loop termination, proceed if required
730
       "emms          \n\t"	// exit MMX state
730
       "emms          \n\t"	// exit MMX state
731
      "popa                   \n\t":"=m" (Dest)	// %0
731
      "popa                   \n\t":"=m" (Dest)	// %0
732
      :"m"(Src2),		// %1
732
      :"m"(Src2),		// %1
Lines 792-804 Link Here
792
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
792
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
793
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
793
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
794
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
794
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
795
      ".L91017:        \n\t" "movq    (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
795
      "1: movq (%%eax), %%mm1 \n\t"	// load 8 bytes from Src1 into mm1
796
      "por     (%%ebx), %%mm1 \n\t"	// mm1=Src1|Src2
796
      "por     (%%ebx), %%mm1 \n\t"	// mm1=Src1|Src2
797
      "movq    %%mm1, (%%edi) \n\t"	// store result in Dest
797
      "movq    %%mm1, (%%edi) \n\t"	// store result in Dest
798
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
798
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
799
      "add $8, %%ebx \n\t"	// register pointers by 8
799
      "add $8, %%ebx \n\t"	// register pointers by 8
800
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
800
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
801
      "jnz .L91017   \n\t"	// check loop termination, proceed if required
801
      "jnz 1b        \n\t"	// check loop termination, proceed if required
802
       "emms          \n\t"	// exit MMX state
802
       "emms          \n\t"	// exit MMX state
803
      "popa                   \n\t":"=m" (Dest)	// %0
803
      "popa                   \n\t":"=m" (Dest)	// %0
804
      :"m"(Src2),		// %1
804
      :"m"(Src2),		// %1
Lines 860-876 Link Here
860
      "mov %0, %%edi \n\t"	// load Dest address into edi
860
      "mov %0, %%edi \n\t"	// load Dest address into edi
861
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
861
       "mov %3, %%ecx \n\t"	// load loop counter (SIZE) into ecx
862
       ".align 16     \n\t"	// 16 byte allignment of the loop entry
862
       ".align 16     \n\t"	// 16 byte allignment of the loop entry
863
      ".L10191:      \n\t" "mov  (%%esi), %%bl  \n\t"	// load a byte from Src2
863
      "1: mov (%%esi), %%bl  \n\t"	// load a byte from Src2
864
      "cmp       $0, %%bl  \n\t"	// check if it zero
864
      "cmp       $0, %%bl  \n\t"	// check if it zero
865
      "jnz .L10192         \n\t" "movb  $255, (%%edi) \n\t"	// division by zero = 255 !!!
865
      "jnz 2f              \n\t" "movb  $255, (%%edi) \n\t"	// division by zero = 255 !!!
866
      "jmp  .L10193        \n\t" ".L10192:            \n\t" "xor   %%ah, %%ah    \n\t"	// prepare AX, zero AH register
866
      "jmp 3f              \n\t" "2:                  \n\t" "xor   %%ah, %%ah    \n\t"	// prepare AX, zero AH register
867
      "mov   (%%edx), %%al \n\t"	// load a byte from Src1 into AL
867
      "mov   (%%edx), %%al \n\t"	// load a byte from Src1 into AL
868
      "div   %%bl          \n\t"	// divide AL by BL
868
      "div   %%bl          \n\t"	// divide AL by BL
869
      "mov   %%al, (%%edi) \n\t"	// move a byte result to Dest
869
      "mov   %%al, (%%edi) \n\t"	// move a byte result to Dest
870
       ".L10193:            \n\t" "inc %%edx \n\t"	// increment Src1, Src2, Dest
870
      "3: inc %%edx        \n\t"	// increment Src1, Src2, Dest
871
      "inc %%esi \n\t"		// pointer registers by one
871
      "inc %%esi \n\t"		// pointer registers by one
872
      "inc %%edi \n\t" "dec %%ecx    \n\t"	// decrease loop counter
872
      "inc %%edi \n\t" "dec %%ecx    \n\t"	// decrease loop counter
873
      "jnz .L10191  \n\t"	// check loop termination, proceed if required
873
      "jnz 1b       \n\t"	// check loop termination, proceed if required
874
       "popa \n\t":"=m" (Dest)	// %0
874
       "popa \n\t":"=m" (Dest)	// %0
875
      :"m"(Src2),		// %1
875
      :"m"(Src2),		// %1
876
      "m"(Src1),		// %2
876
      "m"(Src1),		// %2
Lines 907-918 Link Here
907
       "mov %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
907
       "mov %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
908
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
908
      "shr $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
909
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
909
       ".align 16       \n\t"	// 16 byte allignment of the loop entry
910
      ".L91117:        \n\t" "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from Src1 into mm1
910
      "1: movq (%%eax), %%mm0 \n\t"	// load 8 bytes from Src1 into mm1
911
      "pxor      %%mm1, %%mm0 \n\t"	// negate mm0 by xoring with mm1
911
      "pxor      %%mm1, %%mm0 \n\t"	// negate mm0 by xoring with mm1
912
      "movq    %%mm0, (%%edi) \n\t"	// store result in Dest
912
      "movq    %%mm0, (%%edi) \n\t"	// store result in Dest
913
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
913
       "add $8, %%eax \n\t"	// increase Src1, Src2 and Dest 
914
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
914
      "add $8, %%edi \n\t" "dec %%ecx     \n\t"	// decrease loop counter
915
      "jnz .L91117   \n\t"	// check loop termination, proceed if required
915
      "jnz 1b        \n\t"	// check loop termination, proceed if required
916
       "emms          \n\t"	// exit MMX state
916
       "emms          \n\t"	// exit MMX state
917
      "popa                   \n\t":"=m" (Dest)	// %0
917
      "popa                   \n\t":"=m" (Dest)	// %0
918
      :"m"(Src1),		// %1
918
      :"m"(Src1),		// %1
Lines 980-993 Link Here
980
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
980
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
981
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
981
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
982
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
982
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
983
      ".L1021:                \n\t" 
983
      "1:                     \n\t" 
984
      "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from Src1 into MM0
984
      "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from Src1 into MM0
985
      "paddusb   %%mm1, %%mm0 \n\t"	// MM0=SrcDest+C (add 8 bytes with saturation)
985
      "paddusb   %%mm1, %%mm0 \n\t"	// MM0=SrcDest+C (add 8 bytes with saturation)
986
      "movq    %%mm0, (%%edi) \n\t"	// store result in Dest
986
      "movq    %%mm0, (%%edi) \n\t"	// store result in Dest
987
       "add          $8, %%eax \n\t"	// increase Dest register pointer by 8
987
       "add          $8, %%eax \n\t"	// increase Dest register pointer by 8
988
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
988
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
989
      "dec              %%ecx \n\t"	// decrease loop counter
989
      "dec              %%ecx \n\t"	// decrease loop counter
990
      "jnz             .L1021 \n\t"	// check loop termination, proceed if required
990
      "jnz                 1b \n\t"	// check loop termination, proceed if required
991
       "emms                   \n\t"	// exit MMX state
991
       "emms                   \n\t"	// exit MMX state
992
      "popa                   \n\t":"=m" (Dest)	// %0
992
      "popa                   \n\t":"=m" (Dest)	// %0
993
      :"m"(Src1),		// %1
993
      :"m"(Src1),		// %1
Lines 1059-1072 Link Here
1059
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1059
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1060
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1060
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1061
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
1061
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
1062
      ".L11023:                \n\t" 
1062
      "1:                     \n\t" 
1063
      "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1063
      "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1064
      "paddusb   %%mm1, %%mm0 \n\t"	// MM0=SrcDest+C (add 8 bytes with saturation)
1064
      "paddusb   %%mm1, %%mm0 \n\t"	// MM0=SrcDest+C (add 8 bytes with saturation)
1065
      "movq    %%mm0, (%%edi) \n\t"	// store result in SrcDest
1065
      "movq    %%mm0, (%%edi) \n\t"	// store result in SrcDest
1066
      "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1066
      "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1067
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1067
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1068
      "dec              %%ecx \n\t"	// decrease loop counter
1068
      "dec              %%ecx \n\t"	// decrease loop counter
1069
      "jnz             .L11023 \n\t"	// check loop termination, proceed if required
1069
      "jnz                 1b \n\t"	// check loop termination, proceed if required
1070
       "emms                   \n\t"	// exit MMX state
1070
       "emms                   \n\t"	// exit MMX state
1071
      "popa                   \n\t":"=m" (Dest)	// %0
1071
      "popa                   \n\t":"=m" (Dest)	// %0
1072
      :"m"(Src1),		// %1
1072
      :"m"(Src1),		// %1
Lines 1154-1160 Link Here
1154
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1154
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1155
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1155
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1156
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
1156
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
1157
      ".L1022:                \n\t" 
1157
      "1:                     \n\t" 
1158
      "movq    (%%eax), %%mm2 \n\t"	// load 8 bytes from Src1 into MM2
1158
      "movq    (%%eax), %%mm2 \n\t"	// load 8 bytes from Src1 into MM2
1159
      "psrlw        $1, %%mm2 \n\t"	// shift 4 WORDS of MM2 1 bit to the right
1159
      "psrlw        $1, %%mm2 \n\t"	// shift 4 WORDS of MM2 1 bit to the right
1160
      //    "pand      %%mm0, %%mm2 \n\t"    // apply Mask to 8 BYTES of MM2
1160
      //    "pand      %%mm0, %%mm2 \n\t"    // apply Mask to 8 BYTES of MM2
Lines 1164-1170 Link Here
1164
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1164
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1165
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1165
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1166
      "dec              %%ecx \n\t"	// decrease loop counter
1166
      "dec              %%ecx \n\t"	// decrease loop counter
1167
      "jnz             .L1022 \n\t"	// check loop termination, proceed if required
1167
      "jnz                  1b \n\t"	// check loop termination, proceed if required
1168
       "emms                   \n\t"	// exit MMX state
1168
       "emms                   \n\t"	// exit MMX state
1169
      "popa                   \n\t":"=m" (Dest)	// %0
1169
      "popa                   \n\t":"=m" (Dest)	// %0
1170
      :"m"(Src1),		// %1
1170
      :"m"(Src1),		// %1
Lines 1243-1255 Link Here
1243
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1243
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1244
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1244
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1245
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
1245
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
1246
      ".L1023:                \n\t" "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1246
      "1: movq (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1247
      "psubusb   %%mm1, %%mm0 \n\t"	// MM0=SrcDest-C (sub 8 bytes with saturation)
1247
      "psubusb   %%mm1, %%mm0 \n\t"	// MM0=SrcDest-C (sub 8 bytes with saturation)
1248
      "movq    %%mm0, (%%edi) \n\t"	// store result in SrcDest
1248
      "movq    %%mm0, (%%edi) \n\t"	// store result in SrcDest
1249
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1249
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1250
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1250
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1251
      "dec              %%ecx \n\t"	// decrease loop counter
1251
      "dec              %%ecx \n\t"	// decrease loop counter
1252
      "jnz             .L1023 \n\t"	// check loop termination, proceed if required
1252
      "jnz                 1b \n\t"	// check loop termination, proceed if required
1253
       "emms                   \n\t"	// exit MMX state
1253
       "emms                   \n\t"	// exit MMX state
1254
      "popa                   \n\t":"=m" (Dest)	// %0
1254
      "popa                   \n\t":"=m" (Dest)	// %0
1255
      :"m"(Src1),		// %1
1255
      :"m"(Src1),		// %1
Lines 1322-1334 Link Here
1322
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1322
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1323
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1323
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1324
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
1324
       ".align 16              \n\t"	// 16 byte allignment of the loop entry
1325
      ".L11024:                \n\t" "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1325
      "1: movq (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1326
      "psubusb   %%mm1, %%mm0 \n\t"	// MM0=SrcDest-C (sub 8 bytes with saturation)
1326
      "psubusb   %%mm1, %%mm0 \n\t"	// MM0=SrcDest-C (sub 8 bytes with saturation)
1327
      "movq    %%mm0, (%%edi) \n\t"	// store result in SrcDest
1327
      "movq    %%mm0, (%%edi) \n\t"	// store result in SrcDest
1328
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1328
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1329
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1329
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1330
      "dec              %%ecx \n\t"	// decrease loop counter
1330
      "dec              %%ecx \n\t"	// decrease loop counter
1331
      "jnz             .L11024 \n\t"	// check loop termination, proceed if required
1331
      "jnz                  1b \n\t"	// check loop termination, proceed if required
1332
       "emms                   \n\t"	// exit MMX state
1332
       "emms                   \n\t"	// exit MMX state
1333
      "popa                   \n\t":"=m" (Dest)	// %0
1333
      "popa                   \n\t":"=m" (Dest)	// %0
1334
      :"m"(Src1),		// %1
1334
      :"m"(Src1),		// %1
Lines 1405-1423 Link Here
1405
      "mov           %3, %%cl \n\t"	// load loop counter (N) into CL
1405
      "mov           %3, %%cl \n\t"	// load loop counter (N) into CL
1406
      "movd      %%ecx, %%mm3 \n\t"	// copy (N) into MM3 
1406
      "movd      %%ecx, %%mm3 \n\t"	// copy (N) into MM3 
1407
       "pcmpeqb   %%mm1, %%mm1 \n\t"	// generate all 1's in mm1
1407
       "pcmpeqb   %%mm1, %%mm1 \n\t"	// generate all 1's in mm1
1408
       ".L10240:               \n\t"	// ** Prepare proper bit-Mask in MM1 **
1408
       "1:                     \n\t"	// ** Prepare proper bit-Mask in MM1 **
1409
       "psrlw        $1, %%mm1 \n\t"	// shift 4 WORDS of MM1 1 bit to the right
1409
       "psrlw        $1, %%mm1 \n\t"	// shift 4 WORDS of MM1 1 bit to the right
1410
      //    "pand      %%mm0, %%mm1 \n\t"    // apply Mask to 8 BYTES of MM1
1410
      //    "pand      %%mm0, %%mm1 \n\t"    // apply Mask to 8 BYTES of MM1
1411
      ".byte     0x0f, 0xdb, 0xc8 \n\t" 
1411
      ".byte     0x0f, 0xdb, 0xc8 \n\t" 
1412
      "dec               %%cl \n\t"	// decrease loop counter
1412
      "dec               %%cl \n\t"	// decrease loop counter
1413
      "jnz            .L10240 \n\t"	// check loop termination, proceed if required
1413
      "jnz                 1b \n\t"	// check loop termination, proceed if required
1414
      // ** Shift all bytes of the image **
1414
      // ** Shift all bytes of the image **
1415
       "mov          %1, %%eax \n\t"	// load Src1 address into eax
1415
       "mov          %1, %%eax \n\t"	// load Src1 address into eax
1416
      "mov          %0, %%edi \n\t"	// load Dest address into edi
1416
      "mov          %0, %%edi \n\t"	// load Dest address into edi
1417
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1417
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1418
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1418
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1419
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
1419
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
1420
      ".L10241:               \n\t" 
1420
      "2:                     \n\t" 
1421
      "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1421
      "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1422
      "psrlw     %%mm3, %%mm0 \n\t"	// shift 4 WORDS of MM0 (N) bits to the right
1422
      "psrlw     %%mm3, %%mm0 \n\t"	// shift 4 WORDS of MM0 (N) bits to the right
1423
      //    "pand      %%mm1, %%mm0 \n\t"    // apply proper bit-Mask to 8 BYTES of MM0
1423
      //    "pand      %%mm1, %%mm0 \n\t"    // apply proper bit-Mask to 8 BYTES of MM0
Lines 1426-1432 Link Here
1426
      "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1426
      "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1427
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1427
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1428
      "dec              %%ecx \n\t"	// decrease loop counter
1428
      "dec              %%ecx \n\t"	// decrease loop counter
1429
      "jnz            .L10241 \n\t"	// check loop termination, proceed if required
1429
      "jnz                 2b \n\t"	// check loop termination, proceed if required
1430
       "emms                   \n\t"	// exit MMX state
1430
       "emms                   \n\t"	// exit MMX state
1431
      "popa                   \n\t":"=m" (Dest)	// %0
1431
      "popa                   \n\t":"=m" (Dest)	// %0
1432
      :"m"(Src1),		// %1
1432
      :"m"(Src1),		// %1
Lines 1495-1507 Link Here
1495
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1495
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1496
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1496
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1497
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
1497
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
1498
      ".L13023:                \n\t" "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1498
      "1: movq (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1499
      "psrld   %3, %%mm0 \n\t"	// shift both DWORDs of MM0 right by the bit count in operand %3 (logical shift)
1499
      "psrld   %3, %%mm0 \n\t"	// shift both DWORDs of MM0 right by the bit count in operand %3 (logical shift)
1500
      "movq    %%mm0, (%%edi) \n\t"	// store result in SrcDest
1500
      "movq    %%mm0, (%%edi) \n\t"	// store result in SrcDest
1501
      "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1501
      "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1502
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1502
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1503
      "dec              %%ecx \n\t"	// decrease loop counter
1503
      "dec              %%ecx \n\t"	// decrease loop counter
1504
      "jnz             .L13023 \n\t"	// check loop termination, proceed if required
1504
      "jnz                 1b \n\t"	// check loop termination, proceed if required
1505
      "emms                   \n\t"	// exit MMX state
1505
      "emms                   \n\t"	// exit MMX state
1506
      "popa                   \n\t":"=m" (Dest)	// %0
1506
      "popa                   \n\t":"=m" (Dest)	// %0
1507
      :"m"(Src1),		// %1
1507
      :"m"(Src1),		// %1
Lines 1581-1588 Link Here
1581
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1581
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1582
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1582
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1583
       "cmp         $128, %%al \n\t"	// if (C <= 128) execute more efficient code
1583
       "cmp         $128, %%al \n\t"	// if (C <= 128) execute more efficient code
1584
      "jg             .L10251 \n\t" ".align 16              \n\t"	// 16 byte alignment of the loop entry
1584
      "jg                  2f \n\t" ".align 16              \n\t"	// 16 byte alignment of the loop entry
1585
      ".L10250:               \n\t" "movq    (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
1585
      "1: movq (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
1586
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
1586
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
1587
      "punpcklbw %%mm0, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
1587
      "punpcklbw %%mm0, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
1588
      "punpckhbw %%mm0, %%mm4 \n\t"	// unpack high bytes of SrcDest into words
1588
      "punpckhbw %%mm0, %%mm4 \n\t"	// unpack high bytes of SrcDest into words
Lines 1593-1601 Link Here
1593
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1593
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1594
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1594
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1595
      "dec              %%ecx \n\t"	// decrease loop counter
1595
      "dec              %%ecx \n\t"	// decrease loop counter
1596
      "jnz            .L10250 \n\t"	// check loop termination, proceed if required
1596
      "jnz                 1b \n\t"	// check loop termination, proceed if required
1597
      "jmp            .L10252 \n\t" ".align 16              \n\t"	// 16 byte alignment of the loop entry
1597
      "jmp                 3f \n\t" ".align 16              \n\t"	// 16 byte alignment of the loop entry
1598
      ".L10251:               \n\t" "movq    (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
1598
      "2: movq (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
1599
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
1599
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
1600
      "punpcklbw %%mm0, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
1600
      "punpcklbw %%mm0, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
1601
      "punpckhbw %%mm0, %%mm4 \n\t"	// unpack high bytes of SrcDest into words
1601
      "punpckhbw %%mm0, %%mm4 \n\t"	// unpack high bytes of SrcDest into words
Lines 1615-1622 Link Here
1615
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1615
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1616
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1616
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1617
      "dec              %%ecx \n\t"	// decrease loop counter
1617
      "dec              %%ecx \n\t"	// decrease loop counter
1618
      "jnz            .L10251 \n\t"	// check loop termination, proceed if required
1618
      "jnz                 2b \n\t"	// check loop termination, proceed if required
1619
       ".L10252:               \n\t" "emms                   \n\t"	// exit MMX state
1619
       "3: emms               \n\t"	// exit MMX state
1620
      "popa                   \n\t":"=m" (Dest)	// %0
1620
      "popa                   \n\t":"=m" (Dest)	// %0
1621
      :"m"(Src1),		// %1
1621
      :"m"(Src1),		// %1
1622
      "m"(length),		// %2
1622
      "m"(length),		// %2
Lines 1695-1702 Link Here
1695
      "mov          %0, %%edi \n\t"	// load Dest address into edi
1695
      "mov          %0, %%edi \n\t"	// load Dest address into edi
1696
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1696
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1697
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1697
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1698
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
1698
       ".align 16             \n\t"	// 16 byte alignment of the loop entry
1699
      ".L1026:                \n\t" "movq    (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
1699
      "1: movq (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
1700
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
1700
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
1701
      "punpcklbw %%mm0, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
1701
      "punpcklbw %%mm0, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
1702
      "punpckhbw %%mm0, %%mm4 \n\t"	// unpack high bytes of SrcDest into words
1702
      "punpckhbw %%mm0, %%mm4 \n\t"	// unpack high bytes of SrcDest into words
Lines 1709-1715 Link Here
1709
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1709
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1710
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1710
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1711
      "dec              %%ecx \n\t"	// decrease loop counter
1711
      "dec              %%ecx \n\t"	// decrease loop counter
1712
      "jnz             .L1026 \n\t"	// check loop termination, proceed if required
1712
      "jnz                 1b \n\t"	// check loop termination, proceed if required
1713
       "emms                   \n\t"	// exit MMX state
1713
       "emms                   \n\t"	// exit MMX state
1714
      "popa                   \n\t":"=m" (Dest)	// %0
1714
      "popa                   \n\t":"=m" (Dest)	// %0
1715
      :"m"(Src1),		// %1
1715
      :"m"(Src1),		// %1
Lines 1784-1808 Link Here
1784
      "mov           %3, %%cl \n\t"	// load loop counter (N) into CL
1784
      "mov           %3, %%cl \n\t"	// load loop counter (N) into CL
1785
      "movd      %%ecx, %%mm3 \n\t"	// copy (N) into MM3 
1785
      "movd      %%ecx, %%mm3 \n\t"	// copy (N) into MM3 
1786
       "pcmpeqb   %%mm1, %%mm1 \n\t"	// generate all 1's in mm1
1786
       "pcmpeqb   %%mm1, %%mm1 \n\t"	// generate all 1's in mm1
1787
       ".L10270:               \n\t"	// ** Prepare proper bit-Mask in MM1 **
1787
       "1:                     \n\t"	// ** Prepare proper bit-Mask in MM1 **
1788
       "psllw        $1, %%mm1 \n\t"	// shift 4 WORDS of MM1 1 bit to the left
1788
       "psllw        $1, %%mm1 \n\t"	// shift 4 WORDS of MM1 1 bit to the left
1789
      //    "pand      %%mm0, %%mm1 \n\t"    // apply Mask to 8 BYTES of MM1
1789
      //    "pand      %%mm0, %%mm1 \n\t"    // apply Mask to 8 BYTES of MM1
1790
      ".byte     0x0f, 0xdb, 0xc8 \n\t" "dec %%cl               \n\t"	// decrease loop counter
1790
      ".byte     0x0f, 0xdb, 0xc8 \n\t" "dec %%cl               \n\t"	// decrease loop counter
1791
      "jnz            .L10270 \n\t"	// check loop termination, proceed if required
1791
      "jnz                 1b \n\t"	// check loop termination, proceed if required
1792
      // ** Shift all bytes of the image **
1792
      // ** Shift all bytes of the image **
1793
       "mov          %1, %%eax \n\t"	// load Src1 address into eax
1793
       "mov          %1, %%eax \n\t"	// load Src1 address into eax
1794
      "mov          %0, %%edi \n\t"	// load SrcDest address into edi
1794
      "mov          %0, %%edi \n\t"	// load SrcDest address into edi
1795
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1795
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1796
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1796
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1797
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
1797
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
1798
      ".L10271:               \n\t" "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from Src1 into MM0
1798
      "2: movq (%%eax), %%mm0 \n\t"	// load 8 bytes from Src1 into MM0
1799
      "psllw     %%mm3, %%mm0 \n\t"	// shift 4 WORDS of MM0 (N) bits to the left
1799
      "psllw     %%mm3, %%mm0 \n\t"	// shift 4 WORDS of MM0 (N) bits to the left
1800
      //    "pand      %%mm1, %%mm0 \n\t"    // apply proper bit-Mask to 8 BYTES of MM0
1800
      //    "pand      %%mm1, %%mm0 \n\t"    // apply proper bit-Mask to 8 BYTES of MM0
1801
      ".byte     0x0f, 0xdb, 0xc1 \n\t" "movq    %%mm0, (%%edi) \n\t"	// store result in Dest
1801
      ".byte     0x0f, 0xdb, 0xc1 \n\t" "movq    %%mm0, (%%edi) \n\t"	// store result in Dest
1802
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1802
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1803
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1803
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1804
      "dec              %%ecx \n\t"	// decrease loop counter
1804
      "dec              %%ecx \n\t"	// decrease loop counter
1805
      "jnz            .L10271 \n\t"	// check loop termination, proceed if required
1805
      "jnz                 2b \n\t"	// check loop termination, proceed if required
1806
       "emms                   \n\t"	// exit MMX state
1806
       "emms                   \n\t"	// exit MMX state
1807
      "popa                   \n\t":"=m" (Dest)	// %0
1807
      "popa                   \n\t":"=m" (Dest)	// %0
1808
      :"m"(Src1),		// %1
1808
      :"m"(Src1),		// %1
Lines 1870-1882 Link Here
1870
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1870
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
1871
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1871
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1872
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
1872
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
1873
      ".L12023:                \n\t" "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1873
      "1: movq (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
1874
      "pslld   %3, %%mm0 \n\t"	// shift both DWORDs of MM0 left by the bit count in operand %3 (logical shift)
1874
      "pslld   %3, %%mm0 \n\t"	// shift both DWORDs of MM0 left by the bit count in operand %3 (logical shift)
1875
      "movq    %%mm0, (%%edi) \n\t"	// store result in SrcDest
1875
      "movq    %%mm0, (%%edi) \n\t"	// store result in SrcDest
1876
      "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1876
      "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1877
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1877
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1878
      "dec              %%ecx \n\t"	// decrease loop counter
1878
      "dec              %%ecx \n\t"	// decrease loop counter
1879
      "jnz             .L12023 \n\t"	// check loop termination, proceed if required
1879
      "jnz                 1b \n\t"	// check loop termination, proceed if required
1880
      "emms                   \n\t"	// exit MMX state
1880
      "emms                   \n\t"	// exit MMX state
1881
      "popa                   \n\t":"=m" (Dest)	// %0
1881
      "popa                   \n\t":"=m" (Dest)	// %0
1882
      :"m"(Src1),		// %1
1882
      :"m"(Src1),		// %1
Lines 1949-1956 Link Here
1949
      "mov         %2, %%ecx  \n\t"	// load loop counter (SIZE) into ecx
1949
      "mov         %2, %%ecx  \n\t"	// load loop counter (SIZE) into ecx
1950
      "shr         $3, %%ecx  \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1950
      "shr         $3, %%ecx  \n\t"	// counter/8 (MMX loads 8 bytes at a time)
1951
       "cmp           $7, %%al \n\t"	// if (N <= 7) execute more efficient code
1951
       "cmp           $7, %%al \n\t"	// if (N <= 7) execute more efficient code
1952
      "jg             .L10281 \n\t" ".align 16              \n\t"	// 16 byte alignment of the loop entry
1952
      "jg                  2f \n\t" ".align 16              \n\t"	// 16 byte alignment of the loop entry
1953
      ".L10280:               \n\t" "movq    (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
1953
      "1: movq (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
1954
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
1954
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
1955
      "punpcklbw %%mm0, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
1955
      "punpcklbw %%mm0, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
1956
      "punpckhbw %%mm0, %%mm4 \n\t"	// unpack high bytes of SrcDest into words
1956
      "punpckhbw %%mm0, %%mm4 \n\t"	// unpack high bytes of SrcDest into words
Lines 1961-1969 Link Here
1961
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1961
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1962
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1962
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1963
      "dec              %%ecx \n\t"	// decrease loop counter
1963
      "dec              %%ecx \n\t"	// decrease loop counter
1964
      "jnz            .L10280 \n\t"	// check loop termination, proceed if required
1964
      "jnz                 1b \n\t"	// check loop termination, proceed if required
1965
      "jmp            .L10282 \n\t" ".align 16              \n\t"	// 16 byte alignment of the loop entry
1965
      "jmp                 3f \n\t" ".align 16              \n\t"	// 16 byte alignment of the loop entry
1966
      ".L10281:               \n\t" "movq    (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
1966
      "2: movq (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
1967
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
1967
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
1968
      "punpcklbw %%mm0, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
1968
      "punpcklbw %%mm0, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
1969
      "punpckhbw %%mm0, %%mm4 \n\t"	// unpack high bytes of SrcDest into words
1969
      "punpckhbw %%mm0, %%mm4 \n\t"	// unpack high bytes of SrcDest into words
Lines 1983-1990 Link Here
1983
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1983
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
1984
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1984
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
1985
      "dec              %%ecx \n\t"	// decrease loop counter
1985
      "dec              %%ecx \n\t"	// decrease loop counter
1986
      "jnz            .L10281 \n\t"	// check loop termination, proceed if required
1986
      "jnz                 2b \n\t"	// check loop termination, proceed if required
1987
       ".L10282:               \n\t" "emms                   \n\t"	// exit MMX state
1987
      "3: emms                \n\t"	// exit MMX state
1988
      "popa                   \n\t":"=m" (Dest)	// %0
1988
      "popa                   \n\t":"=m" (Dest)	// %0
1989
      :"m"(Src1),		// %1
1989
      :"m"(Src1),		// %1
1990
      "m"(length),		// %2
1990
      "m"(length),		// %2
Lines 2063-2069 Link Here
2063
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
2063
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
2064
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
2064
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
2065
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
2065
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
2066
      ".L1029:                \n\t" 
2066
      "1:                     \n\t" 
2067
      "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
2067
      "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from SrcDest into MM0
2068
      "paddusb   %%mm2, %%mm0 \n\t"	// MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation)
2068
      "paddusb   %%mm2, %%mm0 \n\t"	// MM0=SrcDest+(0xFF-T) (add 8 bytes with saturation)
2069
      "pcmpeqb   %%mm1, %%mm0 \n\t"	// binarize 255:0, comparing to 255
2069
      "pcmpeqb   %%mm1, %%mm0 \n\t"	// binarize 255:0, comparing to 255
Lines 2071-2077 Link Here
2071
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
2071
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
2072
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
2072
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
2073
      "dec              %%ecx \n\t"	// decrease loop counter
2073
      "dec              %%ecx \n\t"	// decrease loop counter
2074
      "jnz             .L1029 \n\t"	// check loop termination, proceed if required
2074
      "jnz                 1b \n\t"	// check loop termination, proceed if required
2075
       "emms                   \n\t"	// exit MMX state
2075
       "emms                   \n\t"	// exit MMX state
2076
      "popa                   \n\t":"=m" (Dest)	// %0
2076
      "popa                   \n\t":"=m" (Dest)	// %0
2077
      :"m"(Src1),		// %1
2077
      :"m"(Src1),		// %1
Lines 2154-2160 Link Here
2154
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
2154
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
2155
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
2155
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
2156
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
2156
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
2157
      ".L1030:                \n\t" 
2157
      "1:                     \n\t" 
2158
      "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from Src1 into MM0
2158
      "movq    (%%eax), %%mm0 \n\t"	// load 8 bytes from Src1 into MM0
2159
      "paddusb   %%mm1, %%mm0 \n\t"	// MM0=SrcDest+(0xFF-Tmax)
2159
      "paddusb   %%mm1, %%mm0 \n\t"	// MM0=SrcDest+(0xFF-Tmax)
2160
      "psubusb   %%mm7, %%mm0 \n\t"	// MM0=MM0-(0xFF-Tmax+Tmin)
2160
      "psubusb   %%mm7, %%mm0 \n\t"	// MM0=MM0-(0xFF-Tmax+Tmin)
Lines 2163-2169 Link Here
2163
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
2163
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
2164
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
2164
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
2165
      "dec              %%ecx \n\t"	// decrease loop counter
2165
      "dec              %%ecx \n\t"	// decrease loop counter
2166
      "jnz             .L1030 \n\t"	// check loop termination, proceed if required
2166
      "jnz                 1b \n\t"	// check loop termination, proceed if required
2167
       "emms                   \n\t"	// exit MMX state
2167
       "emms                   \n\t"	// exit MMX state
2168
      "popa                   \n\t":"=m" (Dest)	// %0
2168
      "popa                   \n\t":"=m" (Dest)	// %0
2169
      :"m"(Src1),		// %1
2169
      :"m"(Src1),		// %1
Lines 2231-2241 Link Here
2231
      "mov           %4, %%bx \n\t"	// load Cmax in BX
2231
      "mov           %4, %%bx \n\t"	// load Cmax in BX
2232
      "sub           %5, %%ax \n\t"	// AX = Nmax - Nmin
2232
      "sub           %5, %%ax \n\t"	// AX = Nmax - Nmin
2233
      "sub           %3, %%bx \n\t"	// BX = Cmax - Cmin
2233
      "sub           %3, %%bx \n\t"	// BX = Cmax - Cmin
2234
      "jz             .L10311 \n\t"	// check division by zero
2234
      "jz                  1f \n\t"	// check division by zero
2235
      "xor         %%dx, %%dx \n\t"	// prepare for division, zero DX
2235
      "xor         %%dx, %%dx \n\t"	// prepare for division, zero DX
2236
      "div               %%bx \n\t"	// AX = AX/BX
2236
      "div               %%bx \n\t"	// AX = AX/BX
2237
      "jmp            .L10312 \n\t" ".L10311:               \n\t" "mov         $255, %%ax \n\t"	// if div by zero, assume result max. byte value
2237
      "jmp                 2f \n\t" "1:                     \n\t" "mov         $255, %%ax \n\t"	// if div by zero, assume result max. byte value
2238
       ".L10312:               \n\t"	// ** Duplicate AX in 4 words of MM0 **
2238
       "2:                    \n\t"	// ** Duplicate AX in 4 words of MM0 **
2239
       "mov         %%ax, %%bx \n\t"	// copy AX into BX
2239
       "mov         %%ax, %%bx \n\t"	// copy AX into BX
2240
      "shl         $16, %%eax \n\t"	// shift 2 bytes of EAX left
2240
      "shl         $16, %%eax \n\t"	// shift 2 bytes of EAX left
2241
      "mov         %%bx, %%ax \n\t"	// copy BX into AX
2241
      "mov         %%bx, %%ax \n\t"	// copy BX into AX
Lines 2264-2270 Link Here
2264
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
2264
      "mov          %2, %%ecx \n\t"	// load loop counter (SIZE) into ecx
2265
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
2265
      "shr          $3, %%ecx \n\t"	// counter/8 (MMX loads 8 bytes at a time)
2266
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
2266
       ".align 16              \n\t"	// 16 byte alignment of the loop entry
2267
      ".L1031:                \n\t" 
2267
      "1:                     \n\t" 
2268
      "movq    (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
2268
      "movq    (%%eax), %%mm3 \n\t"	// load 8 bytes from Src1 into MM3
2269
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
2269
      "movq      %%mm3, %%mm4 \n\t"	// copy MM3 into MM4 
2270
      "punpcklbw %%mm7, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
2270
      "punpcklbw %%mm7, %%mm3 \n\t"	// unpack low  bytes of SrcDest into words
Lines 2289-2295 Link Here
2289
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
2289
       "add          $8, %%eax \n\t"	// increase Src1 register pointer by 8
2290
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
2290
      "add          $8, %%edi \n\t"	// increase Dest register pointer by 8
2291
      "dec              %%ecx \n\t"	// decrease loop counter
2291
      "dec              %%ecx \n\t"	// decrease loop counter
2292
      "jnz             .L1031 \n\t"	// check loop termination, proceed if required
2292
      "jnz                 1b \n\t"	// check loop termination, proceed if required
2293
       "emms                   \n\t"	// exit MMX state
2293
       "emms                   \n\t"	// exit MMX state
2294
      "popa                   \n\t":"=m" (Dest)	// %0
2294
      "popa                   \n\t":"=m" (Dest)	// %0
2295
      :"m"(Src1),		// %1
2295
      :"m"(Src1),		// %1

Return to bug 219621