gcc 4.4.5, CFLAGS="-march=athlon-xp -O2 -pipe". Using the following test patch, we can add a relatively unique FLDPI instruction to the code, then find it in the disassembly listing and see how it can be reached. If the compiler handled MMX state correctly, it would be impossible to reach FLDPI from any MMX instruction without passing through an EMMS in between. PS. Adding 'asm volatile ("emms")' before the first 'if' expression would make the whole test setup a bit more convincing, but I'm just too lazy to update the disassembly listing. diff --git a/pixman/pixman-fast-path.h b/pixman/pixman-fast-path.h index 6fe448c..b6b921a 100644 --- a/pixman/pixman-fast-path.h +++ b/pixman/pixman-fast-path.h @@ -333,16 +333,21 @@ fast_composite_scaled_nearest_ ## scale_func_name (pixman_implementation_t *imp, if (left_pad > 0) \ { \ scanline_func (dst, src, left_pad, 0, 0, 0); \ + asm volatile ("emms"); \ } \ if (width > 0) \ { \ scanline_func (dst + left_pad, src, width, vx, unit_x, 0); \ + asm volatile ("emms"); \ } \ if (right_pad > 0) \ { \ scanline_func (dst + left_pad + width, src + src_image->bits.width - 1, \ right_pad, 0, 0, 0); \ + asm volatile ("emms"); \ } \ + asm volatile ("fldpi"); \ + asm volatile ("emms"); \ } \ else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ { \ Now the disassembly listing of 'fast_composite_scaled_nearest_sse2_8888_8888_pad_OVER', the function that triggers the assertion failure in the pixman test: 00000250 <fast_composite_scaled_nearest_sse2_8888_8888_pad_OVER>: 250: 55 push %ebp 251: 89 e5 mov %esp,%ebp 253: 57 push %edi 254: 56 push %esi 255: 53 push %ebx 256: 81 ec bc 00 00 00 sub $0xbc,%esp 25c: 8b 4d 18 mov 0x18(%ebp),%ecx 25f: 8b 75 10 mov 0x10(%ebp),%esi 262: 8b 45 20 mov 0x20(%ebp),%eax 265: 8b 55 1c mov 0x1c(%ebp),%edx 268: 8b 99 84 00 00 00 mov 0x84(%ecx),%ebx 26e: c1 e0 10 shl $0x10,%eax 271: c1 e2 10 shl $0x10,%edx 274: 05 00 80 00 00 add $0x8000,%eax 279: 81 c2 00 80 00 00 add $0x8000,%edx 27f: 89 5d b4 mov %ebx,-0x4c(%ebp) 282: 8b 89 8c 00 00 00 mov 0x8c(%ecx),%ecx 288: 8b 5d 10 mov 0x10(%ebp),%ebx 28b: 89 4d b8 mov %ecx,-0x48(%ebp) 28e: 8b 4d 10 mov 
0x10(%ebp),%ecx 291: 8b b6 84 00 00 00 mov 0x84(%esi),%esi 297: 89 b5 70 ff ff ff mov %esi,-0x90(%ebp) 29d: 8b 89 8c 00 00 00 mov 0x8c(%ecx),%ecx 2a3: 89 45 e0 mov %eax,-0x20(%ebp) 2a6: 8d 45 dc lea -0x24(%ebp),%eax 2a9: 89 55 dc mov %edx,-0x24(%ebp) 2ac: c7 45 e4 00 00 01 00 movl $0x10000,-0x1c(%ebp) 2b3: 89 8d 5c ff ff ff mov %ecx,-0xa4(%ebp) 2b9: 89 44 24 04 mov %eax,0x4(%esp) 2bd: 8b 43 30 mov 0x30(%ebx),%eax 2c0: 89 04 24 mov %eax,(%esp) 2c3: e8 fc ff ff ff call 2c4 2c8: 85 c0 test %eax,%eax 2ca: 0f 84 a0 03 00 00 je 670 2d0: 8b 43 30 mov 0x30(%ebx),%eax 2d3: 8b 7d dc mov -0x24(%ebp),%edi 2d6: 8b 30 mov (%eax),%esi 2d8: 4f dec %edi 2d9: 89 75 b0 mov %esi,-0x50(%ebp) 2dc: 8b 40 10 mov 0x10(%eax),%eax 2df: 89 7d dc mov %edi,-0x24(%ebp) 2e2: 89 85 68 ff ff ff mov %eax,-0x98(%ebp) 2e8: 8b 45 e0 mov -0x20(%ebp),%eax 2eb: 48 dec %eax 2ec: 85 ff test %edi,%edi 2ee: 89 45 8c mov %eax,-0x74(%ebp) 2f1: 89 45 e0 mov %eax,-0x20(%ebp) 2f4: 8b 4b 7c mov 0x7c(%ebx),%ecx 2f7: 0f 88 ce 08 00 00 js bcb 2fd: 89 f0 mov %esi,%eax 2ff: 89 f3 mov %esi,%ebx 301: 89 7d a0 mov %edi,-0x60(%ebp) 304: 99 cltd 305: 83 c0 ff add $0xffffffff,%eax 308: c7 45 a8 00 00 00 00 movl $0x0,-0x58(%ebp) 30f: 89 d6 mov %edx,%esi 311: 83 d2 ff adc $0xffffffff,%edx 314: 89 45 98 mov %eax,-0x68(%ebp) 317: 89 55 9c mov %edx,-0x64(%ebp) 31a: 89 fa mov %edi,%edx 31c: c7 45 ac 00 00 00 00 movl $0x0,-0x54(%ebp) 323: c1 fa 1f sar $0x1f,%edx 326: c7 45 90 00 00 00 00 movl $0x0,-0x70(%ebp) 32d: c7 85 7c ff ff ff 00 movl $0x0,-0x84(%ebp) 334: 00 00 00 337: 89 55 a4 mov %edx,-0x5c(%ebp) 33a: 89 c8 mov %ecx,%eax 33c: 89 5c 24 08 mov %ebx,0x8(%esp) 340: 89 74 24 0c mov %esi,0xc(%esp) 344: 99 cltd 345: c1 e0 10 shl $0x10,%eax 348: 0f a4 ca 10 shld $0x10,%ecx,%edx 34c: 03 45 98 add -0x68(%ebp),%eax 34f: 13 55 9c adc -0x64(%ebp),%edx 352: 2b 45 a0 sub -0x60(%ebp),%eax 355: 1b 55 a4 sbb -0x5c(%ebp),%edx 358: 89 04 24 mov %eax,(%esp) 35b: 89 54 24 04 mov %edx,0x4(%esp) 35f: e8 fc ff ff ff call 360 364: 89 c3 mov %eax,%ebx 
366: 2b 5d a8 sub -0x58(%ebp),%ebx 369: 89 d6 mov %edx,%esi 36b: 1b 75 ac sbb -0x54(%ebp),%esi 36e: 8b 55 34 mov 0x34(%ebp),%edx 371: c7 45 80 00 00 00 00 movl $0x0,-0x80(%ebp) 378: 89 95 74 ff ff ff mov %edx,-0x8c(%ebp) 37e: 85 f6 test %esi,%esi 380: 78 27 js 3a9 382: 89 d0 mov %edx,%eax 384: 99 cltd 385: 89 45 c0 mov %eax,-0x40(%ebp) 388: 39 d6 cmp %edx,%esi 38a: 89 55 c4 mov %edx,-0x3c(%ebp) 38d: 0f 8c 25 08 00 00 jl bb8 393: 0f 8e 17 08 00 00 jle bb0 399: 8b 55 34 mov 0x34(%ebp),%edx 39c: c7 85 74 ff ff ff 00 movl $0x0,-0x8c(%ebp) 3a3: 00 00 00 3a6: 89 55 80 mov %edx,-0x80(%ebp) 3a9: 03 7d 90 add -0x70(%ebp),%edi 3ac: 8b 5d 38 mov 0x38(%ebp),%ebx 3af: 4b dec %ebx 3b0: 89 bd 6c ff ff ff mov %edi,-0x94(%ebp) 3b6: 89 5d 94 mov %ebx,-0x6c(%ebp) 3b9: 0f 88 b1 02 00 00 js 670 3bf: 8b 45 b8 mov -0x48(%ebp),%eax 3c2: 8b 5d b0 mov -0x50(%ebp),%ebx 3c5: 0f af 45 30 imul 0x30(%ebp),%eax 3c9: 8b 55 b8 mov -0x48(%ebp),%edx 3cc: 8b 75 b4 mov -0x4c(%ebp),%esi 3cf: 01 db add %ebx,%ebx 3d1: 8b 8d 7c ff ff ff mov -0x84(%ebp),%ecx 3d7: 89 5d a8 mov %ebx,-0x58(%ebp) 3da: 03 5d b0 add -0x50(%ebp),%ebx 3dd: c1 e2 02 shl $0x2,%edx 3e0: 03 45 2c add 0x2c(%ebp),%eax 3e3: 89 55 90 mov %edx,-0x70(%ebp) 3e6: 8b 95 7c ff ff ff mov -0x84(%ebp),%edx 3ec: 03 55 80 add -0x80(%ebp),%edx 3ef: 89 5d a0 mov %ebx,-0x60(%ebp) 3f2: 03 5d b0 add -0x50(%ebp),%ebx 3f5: 8d 04 86 lea (%esi,%eax,4),%eax 3f8: 8d 14 90 lea (%eax,%edx,4),%edx 3fb: 8d 0c 88 lea (%eax,%ecx,4),%ecx 3fe: 89 5d 98 mov %ebx,-0x68(%ebp) 401: 89 55 84 mov %edx,-0x7c(%ebp) 404: 89 4d 88 mov %ecx,-0x78(%ebp) 407: 90 nop 408: 8b 55 8c mov -0x74(%ebp),%edx 40b: 8b 75 90 mov -0x70(%ebp),%esi 40e: 8b 5d 10 mov 0x10(%ebp),%ebx 411: 8b 8d 68 ff ff ff mov -0x98(%ebp),%ecx 417: 01 4d 8c add %ecx,-0x74(%ebp) 41a: 01 c6 add %eax,%esi 41c: c1 fa 10 sar $0x10,%edx 41f: 89 b5 78 ff ff ff mov %esi,-0x88(%ebp) 425: 8b 8b 80 00 00 00 mov 0x80(%ebx),%ecx 42b: 31 db xor %ebx,%ebx 42d: 85 d2 test %edx,%edx 42f: 78 14 js 445 431: 39 ca cmp %ecx,%edx 433: 
0f 8d 47 02 00 00 jge 680 439: 8b 9d 5c ff ff ff mov -0xa4(%ebp),%ebx 43f: 0f af da imul %edx,%ebx 442: c1 e3 02 shl $0x2,%ebx 445: 03 9d 70 ff ff ff add -0x90(%ebp),%ebx 44b: 8b 95 7c ff ff ff mov -0x84(%ebp),%edx 451: 0f ef db pxor %mm3,%mm3 454: 85 d2 test %edx,%edx 456: 89 df mov %ebx,%edi 458: 7f 10 jg 46a 45a: e9 87 00 00 00 jmp 4e6 45f: 90 nop 460: 89 f1 mov %esi,%ecx 462: 89 08 mov %ecx,(%eax) 464: 83 c0 04 add $0x4,%eax 467: 4a dec %edx 468: 74 76 je 4e0 46a: a8 0f test $0xf,%al 46c: 0f 84 a6 03 00 00 je 818 472: 8b 0f mov (%edi),%ecx 474: 8b 30 mov (%eax),%esi 476: 89 cb mov %ecx,%ebx 478: c1 eb 18 shr $0x18,%ebx 47b: fe c3 inc %bl 47d: 74 e3 je 462 47f: 85 c9 test %ecx,%ecx 481: 74 dd je 460 483: 66 0f 6e c1 movd %ecx,%xmm0 487: 0f ef c0 pxor %mm0,%mm0 48a: 66 0f d6 45 c0 movq %xmm0,-0x40(%ebp) 48f: 0f 6f 4d c0 movq -0x40(%ebp),%mm1 493: 66 0f 6e c6 movd %esi,%xmm0 497: 0f 60 c8 punpcklbw %mm0,%mm1 49a: 66 0f d6 45 c0 movq %xmm0,-0x40(%ebp) 49f: 0f 6f 55 c0 movq -0x40(%ebp),%mm2 4a3: 0f 60 d0 punpcklbw %mm0,%mm2 4a6: 0f 70 c1 ff pshufw $0xff,%mm1,%mm0 4aa: 0f ef 05 08 00 00 00 pxor 0x8,%mm0 4b1: 0f d5 c2 pmullw %mm2,%mm0 4b4: 0f dd 05 00 00 00 00 paddusw 0x0,%mm0 4bb: 0f e4 05 10 00 00 00 pmulhuw 0x10,%mm0 4c2: 0f dc c1 paddusb %mm1,%mm0 4c5: 0f 67 c3 packuswb %mm3,%mm0 4c8: 0f 7e c1 movd %mm0,%ecx 4cb: 0f 7e 85 54 ff ff ff movd %mm0,-0xac(%ebp) 4d2: 89 08 mov %ecx,(%eax) 4d4: 83 c0 04 add $0x4,%eax 4d7: 4a dec %edx 4d8: 75 90 jne 46a 4da: 8d b6 00 00 00 00 lea 0x0(%esi),%esi 4e0: 89 fb mov %edi,%ebx 4e2: 0f 77 emms 4e4: 0f 77 emms (0) ^^^^ this is the final EMMS from the inlined 'scanline_func', followed by an extra EMMS added by the test patch 4e6: 8b 4d 80 mov -0x80(%ebp),%ecx 4e9: 8b 45 88 mov -0x78(%ebp),%eax 4ec: 0f ef db pxor %mm3,%mm3 (1) ^^^^ this stray PXOR executes after the EMMS, puts the FPU back into MMX state, and is what causes the problem 4ef: 8b 95 6c ff ff ff mov -0x94(%ebp),%edx 4f5: 85 c9 test %ecx,%ecx 4f7: 0f 8e a2 00 00 00 jle 59f (2) ^^^ here we jump to 59f, skipping the EMMS pair at 59b/59d 4fd: 89 4d b4 mov 
%ecx,-0x4c(%ebp) 500: 89 5d c0 mov %ebx,-0x40(%ebp) 503: eb 13 jmp 518 505: 8d 76 00 lea 0x0(%esi),%esi 508: 89 fb mov %edi,%ebx 50a: 89 18 mov %ebx,(%eax) 50c: 83 c0 04 add $0x4,%eax 50f: ff 4d b4 decl -0x4c(%ebp) 512: 0f 84 80 00 00 00 je 598 518: a8 0f test $0xf,%al 51a: 0f 84 78 04 00 00 je 998 520: 8b 4d c0 mov -0x40(%ebp),%ecx 523: 89 d3 mov %edx,%ebx 525: 03 55 b0 add -0x50(%ebp),%edx 528: c1 fb 10 sar $0x10,%ebx 52b: 8b 38 mov (%eax),%edi 52d: 8b 1c 99 mov (%ecx,%ebx,4),%ebx 530: 89 de mov %ebx,%esi 532: c1 ee 18 shr $0x18,%esi 535: 89 f1 mov %esi,%ecx 537: fe c1 inc %cl 539: 74 cf je 50a 53b: 85 db test %ebx,%ebx 53d: 74 c9 je 508 53f: 66 0f 6e c3 movd %ebx,%xmm0 543: 0f ef c0 pxor %mm0,%mm0 546: 66 0f d6 45 b8 movq %xmm0,-0x48(%ebp) 54b: 0f 6f 4d b8 movq -0x48(%ebp),%mm1 54f: 66 0f 6e c7 movd %edi,%xmm0 553: 0f 60 c8 punpcklbw %mm0,%mm1 556: 66 0f d6 45 b8 movq %xmm0,-0x48(%ebp) 55b: 0f 6f 55 b8 movq -0x48(%ebp),%mm2 55f: 0f 60 d0 punpcklbw %mm0,%mm2 562: 0f 70 c1 ff pshufw $0xff,%mm1,%mm0 566: 0f ef 05 08 00 00 00 pxor 0x8,%mm0 56d: 0f d5 c2 pmullw %mm2,%mm0 570: 0f dd 05 00 00 00 00 paddusw 0x0,%mm0 577: 0f e4 05 10 00 00 00 pmulhuw 0x10,%mm0 57e: 0f dc c1 paddusb %mm1,%mm0 581: 0f 67 c3 packuswb %mm3,%mm0 584: 0f 7e c3 movd %mm0,%ebx 587: 0f 7e 85 54 ff ff ff movd %mm0,-0xac(%ebp) 58e: 89 18 mov %ebx,(%eax) 590: 83 c0 04 add $0x4,%eax 593: ff 4d b4 decl -0x4c(%ebp) 596: 75 80 jne 518 598: 8b 5d c0 mov -0x40(%ebp),%ebx 59b: 0f 77 emms 59d: 0f 77 emms 59f: 8b 85 74 ff ff ff mov -0x8c(%ebp),%eax 5a5: 85 c0 test %eax,%eax 5a7: 0f 8e 9f 00 00 00 jle 64c (3) ^^^ here we jump to 64c 5ad: 8b 55 10 mov 0x10(%ebp),%edx 5b0: 0f ef db pxor %mm3,%mm3 5b3: 8b 42 7c mov 0x7c(%edx),%eax 5b6: 8b 95 74 ff ff ff mov -0x8c(%ebp),%edx 5bc: 8d 5c 83 fc lea -0x4(%ebx,%eax,4),%ebx 5c0: 8b 45 84 mov -0x7c(%ebp),%eax 5c3: 89 df mov %ebx,%edi 5c5: eb 0b jmp 5d2 5c7: 90 nop 5c8: 89 f1 mov %esi,%ecx 5ca: 89 08 mov %ecx,(%eax) 5cc: 83 c0 04 add $0x4,%eax 5cf: 4a dec %edx 5d0: 74 76 
je 648 5d2: a8 0f test $0xf,%al 5d4: 0f 84 be 00 00 00 je 698 5da: 8b 0f mov (%edi),%ecx 5dc: 8b 30 mov (%eax),%esi 5de: 89 cb mov %ecx,%ebx 5e0: c1 eb 18 shr $0x18,%ebx 5e3: fe c3 inc %bl 5e5: 74 e3 je 5ca 5e7: 85 c9 test %ecx,%ecx 5e9: 74 dd je 5c8 5eb: 66 0f 6e c1 movd %ecx,%xmm0 5ef: 0f ef c0 pxor %mm0,%mm0 5f2: 66 0f d6 45 c0 movq %xmm0,-0x40(%ebp) 5f7: 0f 6f 4d c0 movq -0x40(%ebp),%mm1 5fb: 66 0f 6e c6 movd %esi,%xmm0 5ff: 0f 60 c8 punpcklbw %mm0,%mm1 602: 66 0f d6 45 c0 movq %xmm0,-0x40(%ebp) 607: 0f 6f 55 c0 movq -0x40(%ebp),%mm2 60b: 0f 60 d0 punpcklbw %mm0,%mm2 60e: 0f 70 c1 ff pshufw $0xff,%mm1,%mm0 612: 0f ef 05 08 00 00 00 pxor 0x8,%mm0 619: 0f d5 c2 pmullw %mm2,%mm0 61c: 0f dd 05 00 00 00 00 paddusw 0x0,%mm0 623: 0f e4 05 10 00 00 00 pmulhuw 0x10,%mm0 62a: 0f dc c1 paddusb %mm1,%mm0 62d: 0f 67 c3 packuswb %mm3,%mm0 630: 0f 7e c1 movd %mm0,%ecx 633: 0f 7e 85 54 ff ff ff movd %mm0,-0xac(%ebp) 63a: 89 08 mov %ecx,(%eax) 63c: 83 c0 04 add $0x4,%eax 63f: 4a dec %edx 640: 75 90 jne 5d2 642: 8d b6 00 00 00 00 lea 0x0(%esi),%esi 648: 0f 77 emms 64a: 0f 77 emms 64c: d9 eb fldpi 64e: 0f 77 emms (4) ^^^^ OOPS, this EMMS (added by the test patch) was supposed to be redundant, but we actually reached FLDPI and this EMMS with the MMX state still dirty from the PXOR at (1); without the test patch the function could eventually return without cleaning up after that PXOR 650: 8b 45 90 mov -0x70(%ebp),%eax 653: 01 45 84 add %eax,-0x7c(%ebp) 656: 01 45 88 add %eax,-0x78(%ebp) 659: ff 4d 94 decl -0x6c(%ebp) 65c: 78 12 js 670 65e: 8b 85 78 ff ff ff mov -0x88(%ebp),%eax 664: e9 9f fd ff ff jmp 408 669: 8d b4 26 00 00 00 00 lea 0x0(%esi,%eiz,1),%esi 670: 81 c4 bc 00 00 00 add $0xbc,%esp 676: 5b pop %ebx 677: 5e pop %esi 678: 5f pop %edi 679: c9 leave 67a: c3 ret (5) ^^^ yes, for example we can return here 67b: 90 nop 67c: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi 680: 8d 59 ff lea -0x1(%ecx),%ebx 683: 0f af 9d 5c ff ff ff imul -0xa4(%ebp),%ebx 68a: c1 e3 02 shl $0x2,%ebx 68d: e9 b3 fd ff ff jmp 445 692: 8d b6 00 00 00 00 lea 0x0(%esi),%esi 698: 83 fa 03 cmp $0x3,%edx 69b: 89 fb mov %edi,%ebx 
69d: 89 c6 mov %eax,%esi 69f: 89 d7 mov %edx,%edi 6a1: 0f 8e ec 00 00 00 jle 793 6a7: 66 0f ef db pxor %xmm3,%xmm3 6ab: 90 nop 6ac: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi 6b0: 66 0f 6e 0b movd (%ebx),%xmm1 6b4: 66 0f 70 c1 00 pshufd $0x0,%xmm1,%xmm0 6b9: f3 0f 6f d0 movdqu %xmm0,%xmm2 6bd: 0f 29 45 c8 movaps %xmm0,-0x38(%ebp) 6c1: 66 0f 6f ca movdqa %xmm2,%xmm1 6c5: 66 0f 6f c2 movdqa %xmm2,%xmm0 6c9: 66 0f 74 ca pcmpeqb %xmm2,%xmm1 6cd: 66 0f 74 ca pcmpeqb %xmm2,%xmm1 6d1: 66 0f d7 c9 pmovmskb %xmm1,%ecx 6d5: 81 e1 88 88 00 00 and $0x8888,%ecx 6db: 81 f9 88 88 00 00 cmp $0x8888,%ecx 6e1: 0f 84 88 00 00 00 je 76f 6e7: 66 0f 6f ca movdqa %xmm2,%xmm1 6eb: 66 0f 74 cb pcmpeqb %xmm3,%xmm1 6ef: 66 0f d7 c9 pmovmskb %xmm1,%ecx 6f3: 81 f9 ff ff 00 00 cmp $0xffff,%ecx 6f9: 74 77 je 772 6fb: 66 0f 6f e2 movdqa %xmm2,%xmm4 6ff: 66 0f 68 c3 punpckhbw %xmm3,%xmm0 703: 66 0f 6f 08 movdqa (%eax),%xmm1 707: 66 0f 60 e3 punpcklbw %xmm3,%xmm4 70b: 66 0f 6f f1 movdqa %xmm1,%xmm6 70f: f2 0f 70 f8 ff pshuflw $0xff,%xmm0,%xmm7 714: 66 0f 6f 2d 40 00 00 movdqa 0x40,%xmm5 71b: 00 71c: f2 0f 70 d4 ff pshuflw $0xff,%xmm4,%xmm2 721: f3 0f 70 ff ff pshufhw $0xff,%xmm7,%xmm7 726: f3 0f 70 d2 ff pshufhw $0xff,%xmm2,%xmm2 72b: 66 0f 60 f3 punpcklbw %xmm3,%xmm6 72f: 66 0f ef d5 pxor %xmm5,%xmm2 733: 66 0f 68 cb punpckhbw %xmm3,%xmm1 737: 66 0f d5 d6 pmullw %xmm6,%xmm2 73b: 66 0f ef ef pxor %xmm7,%xmm5 73f: 66 0f d5 cd pmullw %xmm5,%xmm1 743: 66 0f 6f 2d 30 00 00 movdqa 0x30,%xmm5 74a: 00 74b: 66 0f dd d5 paddusw %xmm5,%xmm2 74f: 66 0f dd cd paddusw %xmm5,%xmm1 753: 66 0f 6f 2d 50 00 00 movdqa 0x50,%xmm5 75a: 00 75b: 66 0f e4 d5 pmulhuw %xmm5,%xmm2 75f: 66 0f e4 cd pmulhuw %xmm5,%xmm1 763: 66 0f dc d4 paddusb %xmm4,%xmm2 767: 66 0f dc c1 paddusb %xmm1,%xmm0 76b: 66 0f 67 d0 packuswb %xmm0,%xmm2 76f: 0f 29 10 movaps %xmm2,(%eax) 772: 83 ea 04 sub $0x4,%edx 775: 83 c0 10 add $0x10,%eax 778: 83 fa 03 cmp $0x3,%edx 77b: 0f 8f 2f ff ff ff jg 6b0 781: 8d 57 fc lea -0x4(%edi),%edx 784: 89 d0 mov 
%edx,%eax 786: 83 e2 03 and $0x3,%edx 789: c1 e8 02 shr $0x2,%eax 78c: c1 e0 04 shl $0x4,%eax 78f: 8d 44 30 10 lea 0x10(%eax,%esi,1),%eax 793: 85 d2 test %edx,%edx 795: 0f 84 ad fe ff ff je 648 79b: 0f ef db pxor %mm3,%mm3 79e: 89 df mov %ebx,%edi 7a0: eb 14 jmp 7b6 7a2: 8d b6 00 00 00 00 lea 0x0(%esi),%esi 7a8: 89 f1 mov %esi,%ecx 7aa: 4a dec %edx 7ab: 89 08 mov %ecx,(%eax) 7ad: 0f 84 95 fe ff ff je 648 7b3: 83 c0 04 add $0x4,%eax 7b6: 8b 0f mov (%edi),%ecx 7b8: 8b 30 mov (%eax),%esi 7ba: 89 cb mov %ecx,%ebx 7bc: c1 eb 18 shr $0x18,%ebx 7bf: fe c3 inc %bl 7c1: 74 e7 je 7aa 7c3: 85 c9 test %ecx,%ecx 7c5: 74 e1 je 7a8 7c7: 0f ef c0 pxor %mm0,%mm0 7ca: 66 0f 6e d1 movd %ecx,%xmm2 7ce: 66 0f 6e c6 movd %esi,%xmm0 7d2: 66 0f d6 55 c0 movq %xmm2,-0x40(%ebp) 7d7: 0f 6f 4d c0 movq -0x40(%ebp),%mm1 7db: 0f 60 c8 punpcklbw %mm0,%mm1 7de: 66 0f d6 45 c0 movq %xmm0,-0x40(%ebp) 7e3: 0f 6f 55 c0 movq -0x40(%ebp),%mm2 7e7: 0f 60 d0 punpcklbw %mm0,%mm2 7ea: 0f 70 c1 ff pshufw $0xff,%mm1,%mm0 7ee: 0f ef 05 08 00 00 00 pxor 0x8,%mm0 7f5: 0f d5 c2 pmullw %mm2,%mm0 7f8: 0f dd 05 00 00 00 00 paddusw 0x0,%mm0 7ff: 0f e4 05 10 00 00 00 pmulhuw 0x10,%mm0 806: 0f dc c1 paddusb %mm1,%mm0 809: 0f 67 c3 packuswb %mm3,%mm0 80c: 0f 7e 85 54 ff ff ff movd %mm0,-0xac(%ebp) 813: 0f 7e c1 movd %mm0,%ecx 816: eb 92 jmp 7aa 818: 83 fa 03 cmp $0x3,%edx 81b: 89 fb mov %edi,%ebx 81d: 89 c6 mov %eax,%esi 81f: 89 d7 mov %edx,%edi 821: 0f 8e ec 00 00 00 jle 913 827: 66 0f ef db pxor %xmm3,%xmm3 82b: 90 nop 82c: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi 830: 66 0f 6e 0b movd (%ebx),%xmm1 834: 66 0f 70 c1 00 pshufd $0x0,%xmm1,%xmm0 839: f3 0f 6f d0 movdqu %xmm0,%xmm2 83d: 0f 29 45 c8 movaps %xmm0,-0x38(%ebp) 841: 66 0f 6f ca movdqa %xmm2,%xmm1 845: 66 0f 6f c2 movdqa %xmm2,%xmm0 849: 66 0f 74 ca pcmpeqb %xmm2,%xmm1 84d: 66 0f 74 ca pcmpeqb %xmm2,%xmm1 851: 66 0f d7 c9 pmovmskb %xmm1,%ecx 855: 81 e1 88 88 00 00 and $0x8888,%ecx 85b: 81 f9 88 88 00 00 cmp $0x8888,%ecx 861: 0f 84 88 00 00 00 je 8ef 867: 66 0f 6f 
ca movdqa %xmm2,%xmm1 86b: 66 0f 74 cb pcmpeqb %xmm3,%xmm1 86f: 66 0f d7 c9 pmovmskb %xmm1,%ecx 873: 81 f9 ff ff 00 00 cmp $0xffff,%ecx 879: 74 77 je 8f2 87b: 66 0f 6f e2 movdqa %xmm2,%xmm4 87f: 66 0f 68 c3 punpckhbw %xmm3,%xmm0 883: 66 0f 6f 08 movdqa (%eax),%xmm1 887: 66 0f 60 e3 punpcklbw %xmm3,%xmm4 88b: 66 0f 6f f1 movdqa %xmm1,%xmm6 88f: f2 0f 70 f8 ff pshuflw $0xff,%xmm0,%xmm7 894: 66 0f 6f 2d 40 00 00 movdqa 0x40,%xmm5 89b: 00 89c: f2 0f 70 d4 ff pshuflw $0xff,%xmm4,%xmm2 8a1: f3 0f 70 ff ff pshufhw $0xff,%xmm7,%xmm7 8a6: f3 0f 70 d2 ff pshufhw $0xff,%xmm2,%xmm2 8ab: 66 0f 60 f3 punpcklbw %xmm3,%xmm6 8af: 66 0f ef d5 pxor %xmm5,%xmm2 8b3: 66 0f 68 cb punpckhbw %xmm3,%xmm1 8b7: 66 0f d5 d6 pmullw %xmm6,%xmm2 8bb: 66 0f ef ef pxor %xmm7,%xmm5 8bf: 66 0f d5 cd pmullw %xmm5,%xmm1 8c3: 66 0f 6f 2d 30 00 00 movdqa 0x30,%xmm5 8ca: 00 8cb: 66 0f dd d5 paddusw %xmm5,%xmm2 8cf: 66 0f dd cd paddusw %xmm5,%xmm1 8d3: 66 0f 6f 2d 50 00 00 movdqa 0x50,%xmm5 8da: 00 8db: 66 0f e4 d5 pmulhuw %xmm5,%xmm2 8df: 66 0f e4 cd pmulhuw %xmm5,%xmm1 8e3: 66 0f dc d4 paddusb %xmm4,%xmm2 8e7: 66 0f dc c1 paddusb %xmm1,%xmm0 8eb: 66 0f 67 d0 packuswb %xmm0,%xmm2 8ef: 0f 29 10 movaps %xmm2,(%eax) 8f2: 83 ea 04 sub $0x4,%edx 8f5: 83 c0 10 add $0x10,%eax 8f8: 83 fa 03 cmp $0x3,%edx 8fb: 0f 8f 2f ff ff ff jg 830 901: 8d 57 fc lea -0x4(%edi),%edx 904: 89 d0 mov %edx,%eax 906: 83 e2 03 and $0x3,%edx 909: c1 e8 02 shr $0x2,%eax 90c: c1 e0 04 shl $0x4,%eax 90f: 8d 44 30 10 lea 0x10(%eax,%esi,1),%eax 913: 85 d2 test %edx,%edx 915: 0f 84 c7 fb ff ff je 4e2 91b: 0f ef db pxor %mm3,%mm3 91e: 89 df mov %ebx,%edi 920: eb 14 jmp 936 922: 8d b6 00 00 00 00 lea 0x0(%esi),%esi 928: 89 f1 mov %esi,%ecx 92a: 4a dec %edx 92b: 89 08 mov %ecx,(%eax) 92d: 0f 84 ad fb ff ff je 4e0 933: 83 c0 04 add $0x4,%eax 936: 8b 0f mov (%edi),%ecx 938: 8b 30 mov (%eax),%esi 93a: 89 cb mov %ecx,%ebx 93c: c1 eb 18 shr $0x18,%ebx 93f: fe c3 inc %bl 941: 74 e7 je 92a 943: 85 c9 test %ecx,%ecx 945: 74 e1 je 928 947: 0f ef c0 
pxor %mm0,%mm0 94a: 66 0f 6e d1 movd %ecx,%xmm2 94e: 66 0f 6e c6 movd %esi,%xmm0 952: 66 0f d6 55 c0 movq %xmm2,-0x40(%ebp) 957: 0f 6f 4d c0 movq -0x40(%ebp),%mm1 95b: 0f 60 c8 punpcklbw %mm0,%mm1 95e: 66 0f d6 45 c0 movq %xmm0,-0x40(%ebp) 963: 0f 6f 55 c0 movq -0x40(%ebp),%mm2 967: 0f 60 d0 punpcklbw %mm0,%mm2 96a: 0f 70 c1 ff pshufw $0xff,%mm1,%mm0 96e: 0f ef 05 08 00 00 00 pxor 0x8,%mm0 975: 0f d5 c2 pmullw %mm2,%mm0 978: 0f dd 05 00 00 00 00 paddusw 0x0,%mm0 97f: 0f e4 05 10 00 00 00 pmulhuw 0x10,%mm0 986: 0f dc c1 paddusb %mm1,%mm0 989: 0f 67 c3 packuswb %mm3,%mm0 98c: 0f 7e 85 54 ff ff ff movd %mm0,-0xac(%ebp) 993: 0f 7e c1 movd %mm0,%ecx 996: eb 92 jmp 92a 998: 8b 4d b4 mov -0x4c(%ebp),%ecx 99b: 8b 5d c0 mov -0x40(%ebp),%ebx 99e: 89 85 60 ff ff ff mov %eax,-0xa0(%ebp) 9a4: 83 f9 03 cmp $0x3,%ecx 9a7: 89 8d 64 ff ff ff mov %ecx,-0x9c(%ebp) 9ad: 0f 8e 65 01 00 00 jle b18 9b3: 66 0f ef db pxor %xmm3,%xmm3 9b7: 89 95 50 ff ff ff mov %edx,-0xb0(%ebp) 9bd: 89 4d c0 mov %ecx,-0x40(%ebp) 9c0: 8b 95 50 ff ff ff mov -0xb0(%ebp),%edx 9c6: 8b bd 50 ff ff ff mov -0xb0(%ebp),%edi 9cc: 8b 75 b0 mov -0x50(%ebp),%esi 9cf: 8b 4d 98 mov -0x68(%ebp),%ecx 9d2: 01 8d 50 ff ff ff add %ecx,-0xb0(%ebp) 9d8: c1 fa 10 sar $0x10,%edx 9db: 89 55 b8 mov %edx,-0x48(%ebp) 9de: 8b 55 a8 mov -0x58(%ebp),%edx 9e1: 8d 0c 37 lea (%edi,%esi,1),%ecx 9e4: c1 f9 10 sar $0x10,%ecx 9e7: 8d 34 17 lea (%edi,%edx,1),%esi 9ea: 03 7d a0 add -0x60(%ebp),%edi 9ed: 8b 55 b8 mov -0x48(%ebp),%edx 9f0: c1 fe 10 sar $0x10,%esi 9f3: c1 ff 10 sar $0x10,%edi 9f6: 8b 3c bb mov (%ebx,%edi,4),%edi 9f9: 89 7d b4 mov %edi,-0x4c(%ebp) 9fc: 8b 0c 8b mov (%ebx,%ecx,4),%ecx 9ff: 66 0f 6e 04 b3 movd (%ebx,%esi,4),%xmm0 a04: 66 0f 6e 55 b4 movd -0x4c(%ebp),%xmm2 a09: 66 0f 62 c2 punpckldq %xmm2,%xmm0 a0d: 89 4d b4 mov %ecx,-0x4c(%ebp) a10: 66 0f 6e 14 93 movd (%ebx,%edx,4),%xmm2 a15: 66 0f 6e 65 b4 movd -0x4c(%ebp),%xmm4 a1a: 66 0f 62 d4 punpckldq %xmm4,%xmm2 a1e: f3 0f 7e c8 movq %xmm0,%xmm1 a22: f3 0f 7e c2 movq %xmm2,%xmm0 
a26: 66 0f 6c c1 punpcklqdq %xmm1,%xmm0 a2a: f3 0f 6f d0 movdqu %xmm0,%xmm2 a2e: 0f 29 45 c8 movaps %xmm0,-0x38(%ebp) a32: 66 0f 6f ca movdqa %xmm2,%xmm1 a36: 66 0f 6f c2 movdqa %xmm2,%xmm0 a3a: 66 0f 74 ca pcmpeqb %xmm2,%xmm1 a3e: 66 0f 74 ca pcmpeqb %xmm2,%xmm1 a42: 66 0f d7 c9 pmovmskb %xmm1,%ecx a46: 81 e1 88 88 00 00 and $0x8888,%ecx a4c: 81 f9 88 88 00 00 cmp $0x8888,%ecx a52: 0f 84 88 00 00 00 je ae0 a58: 66 0f 6f ca movdqa %xmm2,%xmm1 a5c: 66 0f 74 cb pcmpeqb %xmm3,%xmm1 a60: 66 0f d7 c9 pmovmskb %xmm1,%ecx a64: 81 f9 ff ff 00 00 cmp $0xffff,%ecx a6a: 74 77 je ae3 a6c: 66 0f 6f e2 movdqa %xmm2,%xmm4 a70: 66 0f 68 c3 punpckhbw %xmm3,%xmm0 a74: 66 0f 6f 08 movdqa (%eax),%xmm1 a78: 66 0f 60 e3 punpcklbw %xmm3,%xmm4 a7c: 66 0f 6f f1 movdqa %xmm1,%xmm6 a80: f2 0f 70 f8 ff pshuflw $0xff,%xmm0,%xmm7 a85: 66 0f 6f 2d 40 00 00 movdqa 0x40,%xmm5 a8c: 00 a8d: f2 0f 70 d4 ff pshuflw $0xff,%xmm4,%xmm2 a92: f3 0f 70 ff ff pshufhw $0xff,%xmm7,%xmm7 a97: f3 0f 70 d2 ff pshufhw $0xff,%xmm2,%xmm2 a9c: 66 0f 60 f3 punpcklbw %xmm3,%xmm6 aa0: 66 0f ef d5 pxor %xmm5,%xmm2 aa4: 66 0f 68 cb punpckhbw %xmm3,%xmm1 aa8: 66 0f d5 d6 pmullw %xmm6,%xmm2 aac: 66 0f ef ef pxor %xmm7,%xmm5 ab0: 66 0f d5 cd pmullw %xmm5,%xmm1 ab4: 66 0f 6f 2d 30 00 00 movdqa 0x30,%xmm5 abb: 00 abc: 66 0f dd d5 paddusw %xmm5,%xmm2 ac0: 66 0f dd cd paddusw %xmm5,%xmm1 ac4: 66 0f 6f 2d 50 00 00 movdqa 0x50,%xmm5 acb: 00 acc: 66 0f e4 d5 pmulhuw %xmm5,%xmm2 ad0: 66 0f e4 cd pmulhuw %xmm5,%xmm1 ad4: 66 0f dc d4 paddusb %xmm4,%xmm2 ad8: 66 0f dc c1 paddusb %xmm1,%xmm0 adc: 66 0f 67 d0 packuswb %xmm0,%xmm2 ae0: 0f 29 10 movaps %xmm2,(%eax) ae3: 83 6d c0 04 subl $0x4,-0x40(%ebp) ae7: 83 c0 10 add $0x10,%eax aea: 83 7d c0 03 cmpl $0x3,-0x40(%ebp) aee: 0f 8f cc fe ff ff jg 9c0 af4: 8b 8d 64 ff ff ff mov -0x9c(%ebp),%ecx afa: 8b b5 60 ff ff ff mov -0xa0(%ebp),%esi b00: 8b 95 50 ff ff ff mov -0xb0(%ebp),%edx b06: 83 e9 04 sub $0x4,%ecx b09: 89 c8 mov %ecx,%eax b0b: 83 e1 03 and $0x3,%ecx b0e: c1 e8 02 shr $0x2,%eax 
b11: c1 e0 04 shl $0x4,%eax b14: 8d 44 30 10 lea 0x10(%eax,%esi,1),%eax b18: 85 c9 test %ecx,%ecx b1a: 0f 84 7b fa ff ff je 59b b20: 0f ef db pxor %mm3,%mm3 b23: 89 4d b4 mov %ecx,-0x4c(%ebp) b26: 89 5d c0 mov %ebx,-0x40(%ebp) b29: eb 15 jmp b40 b2b: 90 nop b2c: 8d 74 26 00 lea 0x0(%esi,%eiz,1),%esi b30: 89 fb mov %edi,%ebx b32: 89 18 mov %ebx,(%eax) b34: ff 4d b4 decl -0x4c(%ebp) b37: 0f 84 5b fa ff ff je 598 b3d: 83 c0 04 add $0x4,%eax b40: 8b 4d c0 mov -0x40(%ebp),%ecx b43: 89 d3 mov %edx,%ebx b45: 03 55 b0 add -0x50(%ebp),%edx b48: c1 fb 10 sar $0x10,%ebx b4b: 8b 38 mov (%eax),%edi b4d: 8b 1c 99 mov (%ecx,%ebx,4),%ebx b50: 89 de mov %ebx,%esi b52: c1 ee 18 shr $0x18,%esi b55: 89 f1 mov %esi,%ecx b57: fe c1 inc %cl b59: 74 d7 je b32 b5b: 85 db test %ebx,%ebx b5d: 74 d1 je b30 b5f: 66 0f 6e c3 movd %ebx,%xmm0 b63: 0f ef c0 pxor %mm0,%mm0 b66: 66 0f d6 45 b8 movq %xmm0,-0x48(%ebp) b6b: 0f 6f 4d b8 movq -0x48(%ebp),%mm1 b6f: 66 0f 6e c7 movd %edi,%xmm0 b73: 0f 60 c8 punpcklbw %mm0,%mm1 b76: 66 0f d6 45 b8 movq %xmm0,-0x48(%ebp) b7b: 0f 6f 55 b8 movq -0x48(%ebp),%mm2 b7f: 0f 60 d0 punpcklbw %mm0,%mm2 b82: 0f 70 c1 ff pshufw $0xff,%mm1,%mm0 b86: 0f ef 05 08 00 00 00 pxor 0x8,%mm0 b8d: 0f d5 c2 pmullw %mm2,%mm0 b90: 0f dd 05 00 00 00 00 paddusw 0x0,%mm0 b97: 0f e4 05 10 00 00 00 pmulhuw 0x10,%mm0 b9e: 0f dc c1 paddusb %mm1,%mm0 ba1: 0f 67 c3 packuswb %mm3,%mm0 ba4: 0f 7e 85 54 ff ff ff movd %mm0,-0xac(%ebp) bab: 0f 7e c3 movd %mm0,%ebx bae: eb 82 jmp b32 bb0: 39 c3 cmp %eax,%ebx bb2: 0f 83 e1 f7 ff ff jae 399 bb8: 8b 4d 34 mov 0x34(%ebp),%ecx bbb: 89 5d 80 mov %ebx,-0x80(%ebp) bbe: 29 d9 sub %ebx,%ecx bc0: 89 8d 74 ff ff ff mov %ecx,-0x8c(%ebp) bc6: e9 de f7 ff ff jmp 3a9 bcb: 8b 45 b0 mov -0x50(%ebp),%eax bce: 89 7d a0 mov %edi,-0x60(%ebp) bd1: 89 8d 58 ff ff ff mov %ecx,-0xa8(%ebp) bd7: 99 cltd bd8: 89 c3 mov %eax,%ebx bda: 83 c0 ff add $0xffffffff,%eax bdd: 89 d6 mov %edx,%esi bdf: 83 d2 ff adc $0xffffffff,%edx be2: 89 45 98 mov %eax,-0x68(%ebp) be5: 89 55 9c mov 
%edx,-0x64(%ebp) be8: 89 fa mov %edi,%edx bea: 8b 45 98 mov -0x68(%ebp),%eax bed: c1 fa 1f sar $0x1f,%edx bf0: 2b 45 a0 sub -0x60(%ebp),%eax bf3: 89 5c 24 08 mov %ebx,0x8(%esp) bf7: 89 55 a4 mov %edx,-0x5c(%ebp) bfa: 8b 55 9c mov -0x64(%ebp),%edx bfd: 1b 55 a4 sbb -0x5c(%ebp),%edx c00: 89 74 24 0c mov %esi,0xc(%esp) c04: 89 04 24 mov %eax,(%esp) c07: 89 54 24 04 mov %edx,0x4(%esp) c0b: e8 fc ff ff ff call c0c c10: 89 85 7c ff ff ff mov %eax,-0x84(%ebp) c16: 8b 45 34 mov 0x34(%ebp),%eax c19: 89 55 c0 mov %edx,-0x40(%ebp) c1c: 8b 8d 58 ff ff ff mov -0xa8(%ebp),%ecx c22: 99 cltd c23: 39 55 c0 cmp %edx,-0x40(%ebp) c26: 89 45 a8 mov %eax,-0x58(%ebp) c29: 89 55 ac mov %edx,-0x54(%ebp) c2c: 7c 29 jl c57 c2e: 7e 1f jle c4f c30: 8b 55 34 mov 0x34(%ebp),%edx c33: 8b 45 34 mov 0x34(%ebp),%eax c36: 0f af 55 b0 imul -0x50(%ebp),%edx c3a: c7 45 34 00 00 00 00 movl $0x0,0x34(%ebp) c41: 89 85 7c ff ff ff mov %eax,-0x84(%ebp) c47: 89 55 90 mov %edx,-0x70(%ebp) c4a: e9 eb f6 ff ff jmp 33a c4f: 39 85 7c ff ff ff cmp %eax,-0x84(%ebp) c55: 77 d9 ja c30 c57: 8b 95 7c ff ff ff mov -0x84(%ebp),%edx c5d: 29 55 34 sub %edx,0x34(%ebp) c60: 0f af 55 b0 imul -0x50(%ebp),%edx c64: 8b 85 7c ff ff ff mov -0x84(%ebp),%eax c6a: 89 45 a8 mov %eax,-0x58(%ebp) c6d: 89 55 90 mov %edx,-0x70(%ebp) c70: 99 cltd c71: 89 55 ac mov %edx,-0x54(%ebp) c74: e9 c1 f6 ff ff jmp 33a c79: 8d b4 26 00 00 00 00 lea 0x0(%esi,%eiz,1),%esi