Attachment 57157 Details for Bug 84520 – My first draft of an AMD64/MMX/SSE2 port of Eterm's shading functions.

[patch] My first draft of an AMD64/MMX/SSE2 port of Eterm's shading functions.

mmx_64_cmod.S (text/plain), 22.57 KB, created by Tres 'RiverRat' Melton on 2005-04-25 03:20:35 UTC

(hide)

Description:

Filename:

MIME Type:

Creator: Tres 'RiverRat' Melton

Created: 2005-04-25 03:20:35 UTC

Size: 22.57 KB

patch

obsolete

>/*
> * Copyright (C) 1997-2004, Michael Jennings
> *
> * Permission is hereby granted, free of charge, to any person obtaining a copy
> * of this software and associated documentation files (the "Software"), to
> * deal in the Software without restriction, including without limitation the
> * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
> * sell copies of the Software, and to permit persons to whom the Software is
> * furnished to do so, subject to the following conditions:
> *
> * The above copyright notice and this permission notice shall be included in
> * all copies of the Software, its documentation and marketing & publicity
> * materials, and acknowledgment shall be given in the documentation, materials
> * and software packages that this Software was used.
> *
> * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
> * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
> */
>
>#include "config.h"
>
>/* MMX routines for tinting XImages on AMD64 updated by Tres <tres@mindspring.com> */
>/* MMX routines for tinting XImages written by Willem Monsuwe <willem@stack.nl>    */
>
>/* AMD64 Function calling conventions:
> *   shade_ximage_xx_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
> *   shade_ximage_xx_mmx_64(void *data, int bpl, int w, int h, int rm, int gm, int bm);
> */
>
>/* Manuals used in this port:
> *	The Gnu Assembler
> *		http://www.gnu.org/software/binutils/manual/gas-2.9.1/html_mono/as.html
> *	AMD64 Architecture Programmer's Manual Volume 1: Application Programming
> *		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24592.pdf
> *	AMD64 Architecture Programmer's Manual Volume 2: System Programming
> *		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
> *	AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions
> *		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24594.pdf
> *	AMD64 Architecture Programmer's Manual Volume 4: 128-Bit Media Instructions
> *		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26568.pdf
> *	AMD64 Architecture Programmer's Manual Volume 5: 64-Bit Media and x87 Floating-Point Instructions
> *		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26569.pdf
> */
>
>#ifdef HAVE_MMX_64
>
>#define data	16(%rbp)	/* 1st argument slot: pointer to the image data */
>#define bpl	24(%rbp)	/* 2nd argument slot: added to the row pointer after each row */
>#define w	32(%rbp)	/* 3rd argument slot: width, counted in pixels */
>#define h	40(%rbp)	/* 4th argument slot: number of rows */
>#define rm	48(%rbp)	/* 5th argument slot: red multiplier */
>#define gm	56(%rbp)	/* 6th argument slot: green multiplier */
>#define bm	64(%rbp)	/* 7th argument slot: blue multiplier. NOTE(review): these 8-byte-spaced offsets assume every argument was pushed on the stack; the System V AMD64 ABI passes the first six integer arguments in rdi, rsi, rdx, rcx, r8, r9 -- confirm against the target ABI */
>
>/*
>.global shade_ximage_15_mmx_64
>        .type shade_ximage_15_mmx_64,@function
>.global shade_ximage_16_mmx_64
>        .type shade_ximage_16_mmx_64,@function
>*/
>.global shade_ximage_32_mmx_64   /* the only routine exported here; the 15/16-bit exports above are commented out */
>        .type shade_ximage_32_mmx_64,@function
>
>.bss                            /* switch to .bss; nothing is emitted into it before the next directive */
>.text                           /* everything below is assembled into .text */
>.align 8
>
>#define ENTER                   /* Prologue macro: build a frame, save registers, then load data/w/h */ \
>        pushq %rbp              /* save the caller's frame pointer */ ;\
>        movq %rsp, %rbp         /* rbp = frame base that the argument macros index from */ ;\
>        pushq %rbx              /* saved here, restored by LEAVE */ ;\
>        pushq %rcx              ;\
>        pushq %rdx              ;\
>        pushq %rdi              ;\
>        pushq %rsi              ;\
>        movq data, %rsi         /* rsi = data */ ;\
>        movq w, %rbx            /* rbx = w; NOTE(review): 64-bit load of a value the prototype comment declares as int */ ;\
>        movq h, %rdx            /* rdx = h; same 64-bit-load caveat as w */
>
>#define LEAVE                   /* Epilogue macro: undo ENTER and return to the caller */ \
>4:                              /* local label 4; NOTE(review): no jump in the visible code appears to target this copy */ ;\
>        emms                    /* clear MMX state before returning */ ;\
>        popq %rsi               /* pops mirror the pushes in ENTER, in reverse order */ ;\
>        popq %rdi               ;\
>        popq %rdx               ;\
>        popq %rcx               ;\
>        popq %rbx               ;\
>        movq %rbp, %rsp         /* drop anything still on the stack below the frame base */ ;\
>        popq %rbp               ;\
>        ret
>
>#if 0 /* This comments out everything from here down to 'shade_ximage_32_mmx_64()' */
>	/* The AMD64 port is complete up to this point and for the last function, shade_ximage_32_mmx_64; the routines inside this #if 0 block are not ported yet. */
>shade_ximage_15_mmx_64:         /* 15-bit variant (5 bits per channel, judging by the shift counts); compiled out by the enclosing #if 0 */
>        ENTER                   /* ENTER leaves rsi = data, rbx = w, rdx = h */
>
>        leal -6(%esi, %ebx, 2), %esi    /* esi = data + 2*w - 6; NOTE(review): 32-bit address arithmetic, not yet widened for 64-bit pointers */
>        negl %ebx               /* ebx = -w; ZF is set when w == 0 */
>        jz 5f                   /* zero width: skip straight to the epilogue */
>
>        /* Setup multipliers */
>        movd rm, %mm5
>        movd gm, %mm6
>        movd bm, %mm7
>        punpcklwd %mm5, %mm5    /* 00 00 00 00 rm rm rm rm */
>        punpcklwd %mm6, %mm6    /* 00 00 00 00 gm gm gm gm */
>        punpcklwd %mm7, %mm7    /* 00 00 00 00 bm bm bm bm */
>        punpckldq %mm5, %mm5    /* rm rm rm rm rm rm rm rm */
>        punpckldq %mm6, %mm6    /* gm gm gm gm gm gm gm gm */
>        punpckldq %mm7, %mm7    /* bm bm bm bm bm bm bm bm */
>
>        cmpl $256, rm           /* any multiplier above 256 selects the saturating path */
>        jg shade_ximage_15_mmx_saturate /* NOTE(review): no label with exactly this name is visible; the routine below is shade_ximage_15_mmx_saturate_64 */
>        cmpl $256, gm
>        jg shade_ximage_15_mmx_saturate
>        cmpl $256, bm
>        jg shade_ximage_15_mmx_saturate
>
>1:      movl %ebx, %ecx         /* start of a row: ecx = -w */
>        addl $3, %ecx           /* ecx = 3 - w, so (esi, ecx, 2) addresses the first pixel of the row */
>        jns 3f                  /* three pixels or fewer: use the one-pixel loop only */
>2:                              /* four pixels (one 64-bit load/store) per iteration */
>        movq (%esi, %ecx, 2), %mm0
>
>        movq %mm0, %mm1         /* rg gb */
>        movq %mm0, %mm2         /* rg gb */
>        psrlw $5, %mm1          /* 0r rg */
>        psrlw $10, %mm0         /* 00 0r */
>        psllw $11, %mm2         /* b0 00 */
>        psllw $11, %mm1         /* g0 00 */
>        psllw $8, %mm0          /* 0r 00 */
>        psrlw $3, %mm1          /* 0g 00 */
>        psrlw $3, %mm2          /* 0b 00 */
>
>        pmulhw %mm5, %mm0       /* 00 0r */
>        pmulhw %mm6, %mm1       /* 00 0g */
>        pmulhw %mm7, %mm2       /* 00 0b */
>
>        psllw $10, %mm0         /* r0 00 */
>        psllw $5, %mm1          /* 0g g0 */
>        por %mm2, %mm0          /* r0 0b */
>        por %mm1, %mm0          /* rg gb */
>        
>        movq %mm0, (%esi, %ecx, 2)
>
>        addl $4, %ecx           /* advance four pixels */
>        js 2b                   /* still at least four pixels before the tail */
>        jmp 4f                  /* go check whether single pixels remain */
>3:                              /* one pixel (one 16-bit load/store) per iteration */
>        movw (%esi, %ecx, 2), %ax
>        movd %eax, %mm0
>
>        movq %mm0, %mm1         /* rg gb */
>        movq %mm0, %mm2         /* rg gb */
>        psrlw $5, %mm1          /* 0r rg */
>        psrlw $10, %mm0         /* 00 0r */
>        psllw $11, %mm2         /* b0 00 */
>        psllw $11, %mm1         /* g0 00 */
>        psllw $8, %mm0          /* 0r 00 */
>        psrlw $3, %mm1          /* 0g 00 */
>        psrlw $3, %mm2          /* 0b 00 */
>
>        pmulhw %mm5, %mm0       /* 00 0r */
>        pmulhw %mm6, %mm1       /* 00 0g */
>        pmulhw %mm7, %mm2       /* 00 0b */
>
>        psllw $10, %mm0         /* r0 00 */
>        psllw $5, %mm1          /* 0g g0 */
>        por %mm2, %mm0          /* r0 0b */
>        por %mm1, %mm0          /* rg gb */
>
>        movd %mm0, %eax
>        movw %ax, (%esi, %ecx, 2)
>
>        incl %ecx
>4:
>        cmpl $2, %ecx           /* indices up to 2 are still inside the row (index 3 is one past the end) */
>        jng 3b
>
>        addl bpl, %esi          /* move the row pointer down one line */
>        decl %edx               /* one row done */
>        jnz 1b
>5:
>        LEAVE
>
>
>shade_ximage_15_mmx_saturate_64:        /* saturating 15-bit path; has no ENTER of its own but ends in LEAVE -- presumably reached by a jump from shade_ximage_15_mmx_64 with esi/ebx/edx and mm5-mm7 already set up; confirm */
>
>        pcmpeqw %mm3, %mm3      /* all ones */
>        psllw $5, %mm3          /* ff e0 ff e0 ff e0 ff e0 */
>
>1:      movl %ebx, %ecx         /* start of a row: ecx = ebx */
>        addl $3, %ecx
>        jns 3f                  /* non-negative after the +3 bias: use the one-pixel loop only */
>2:                              /* four pixels (one 64-bit load/store) per iteration */
>        movq (%esi, %ecx, 2), %mm0
>
>        movq %mm0, %mm1         /* rg gb */
>        movq %mm0, %mm2         /* rg gb */
>        psrlw $5, %mm1          /* 0r rg */
>        psrlw $10, %mm0         /* 00 0r */
>        psllw $11, %mm2         /* b0 00 */
>        psllw $11, %mm1         /* g0 00 */
>        psllw $8, %mm0          /* 0r 00 */
>        psrlw $3, %mm1          /* 0g 00 */
>        psrlw $3, %mm2          /* 0b 00 */
>
>        pmulhw %mm5, %mm0       /* xx xr */
>        pmulhw %mm6, %mm1       /* xx xg */
>        pmulhw %mm7, %mm2       /* xx xb */
>
>        /* Saturate upper */
>        paddusw %mm3, %mm0      /* ff er */
>        paddusw %mm3, %mm1      /* ff eg */
>        paddusw %mm3, %mm2      /* ff eb */
>
>        psubw %mm3, %mm1        /* 00 0g */
>        psubw %mm3, %mm2        /* 00 0b */
>        
>        psllw $10, %mm0         /* r0 00 */
>        psllw $5, %mm1          /* 0g g0 */
>        por %mm2, %mm0          /* r0 0b */
>        por %mm1, %mm0          /* rg gb */
>
>        movq %mm0, (%esi, %ecx, 2)
>
>        addl $4, %ecx           /* advance four pixels */
>        js 2b
>        jmp 4f                  /* go check whether single pixels remain */
>3:                              /* one pixel (one 16-bit load/store) per iteration */
>        movw (%esi, %ecx, 2), %ax
>        movd %eax, %mm0
>
>        movq %mm0, %mm1         /* rg gb */
>        movq %mm0, %mm2         /* rg gb */
>        psrlw $5, %mm1          /* 0r rg */
>        psrlw $10, %mm0         /* 00 0r */
>        psllw $11, %mm2         /* b0 00 */
>        psllw $11, %mm1         /* g0 00 */
>        psllw $8, %mm0          /* 0r 00 */
>        psrlw $3, %mm1          /* 0g 00 */
>        psrlw $3, %mm2          /* 0b 00 */
>
>        pmulhw %mm5, %mm0       /* xx xr */
>        pmulhw %mm6, %mm1       /* xx xg */
>        pmulhw %mm7, %mm2       /* xx xb */
>
>        /* Saturate upper */
>        paddusw %mm3, %mm0      /* ff er */
>        paddusw %mm3, %mm1      /* ff eg */
>        paddusw %mm3, %mm2      /* ff eb */
>
>        psubw %mm3, %mm1        /* 00 0g */
>        psubw %mm3, %mm2        /* 00 0b */
>        
>        psllw $10, %mm0         /* r0 00 */
>        psllw $5, %mm1          /* 0g g0 */
>        por %mm2, %mm0          /* r0 0b */
>        por %mm1, %mm0          /* rg gb */
>
>        movd %mm0, %eax
>        movw %ax, (%esi, %ecx, 2)
>
>        incl %ecx
>4:
>        cmpl $2, %ecx           /* keep taking single pixels while ecx <= 2 */
>        jng 3b
>
>        addl bpl, %esi          /* move the row pointer down one line */
>        decl %edx               /* one row done */
>        jnz 1b
>5:
>        LEAVE
>
>
>shade_ximage_16_mmx_64:         /* 16-bit variant (5-6-5 layout, judging by the shift counts); compiled out by the enclosing #if 0 */
>        ENTER                   /* ENTER leaves rsi = data, rbx = w, rdx = h */
>
>        leal -6(%esi, %ebx, 2), %esi    /* esi = data + 2*w - 6; NOTE(review): 32-bit address arithmetic, not yet widened for 64-bit pointers */
>        negl %ebx               /* ebx = -w; ZF is set when w == 0 */
>        jz 5f                   /* zero width: skip straight to the epilogue */
>
>        /* Setup multipliers */
>        movd rm, %mm5
>        movd gm, %mm6
>        movd bm, %mm7
>        punpcklwd %mm5, %mm5    /* 00 00 00 00 rm rm rm rm */
>        punpcklwd %mm6, %mm6    /* 00 00 00 00 gm gm gm gm */
>        punpcklwd %mm7, %mm7    /* 00 00 00 00 bm bm bm bm */
>        punpckldq %mm5, %mm5    /* rm rm rm rm rm rm rm rm */
>        punpckldq %mm6, %mm6    /* gm gm gm gm gm gm gm gm */
>        punpckldq %mm7, %mm7    /* bm bm bm bm bm bm bm bm */
>
>        cmpl $256, rm           /* any multiplier above 256 selects the saturating path */
>        jg shade_ximage_16_mmx_saturate /* NOTE(review): no label with exactly this name is visible; the routine below is shade_ximage_16_mmx_saturate_64 */
>        cmpl $256, gm
>        jg shade_ximage_16_mmx_saturate
>        cmpl $256, bm
>        jg shade_ximage_16_mmx_saturate
>
>1:      movl %ebx, %ecx         /* start of a row: ecx = -w */
>        addl $3, %ecx           /* ecx = 3 - w, so (esi, ecx, 2) addresses the first pixel of the row */
>        jns 3f                  /* three pixels or fewer: use the one-pixel loop only */
>2:                              /* four pixels (one 64-bit load/store) per iteration */
>        movq (%esi, %ecx, 2), %mm0
>
>        movq %mm0, %mm1         /* rg gb */
>        movq %mm0, %mm2         /* rg gb */
>        psrlw $5, %mm1          /* 0r rg */
>        psrlw $11, %mm0         /* 00 0r */
>        psllw $11, %mm2         /* b0 00 */
>        psllw $10, %mm1         /* g0 00 */
>        psllw $8, %mm0          /* 0r 00 */
>        psrlw $2, %mm1          /* 0g 00 */
>        psrlw $3, %mm2          /* 0b 00 */
>
>        pmulhw %mm5, %mm0       /* 00 0r */
>        pmulhw %mm6, %mm1       /* 00 0g */
>        pmulhw %mm7, %mm2       /* 00 0b */
>
>        psllw $11, %mm0         /* r0 00 */
>        psllw $5, %mm1          /* 0g g0 */
>        por %mm2, %mm0          /* r0 0b */
>        por %mm1, %mm0          /* rg gb */
>        
>        movq %mm0, (%esi, %ecx, 2)
>
>        addl $4, %ecx           /* advance four pixels */
>        js 2b                   /* still at least four pixels before the tail */
>	jmp 4f                  /* go check whether single pixels remain */
>3:                              /* one pixel (one 16-bit load/store) per iteration */
>        movw (%esi, %ecx, 2), %ax
>        movd %eax, %mm0
>
>        movq %mm0, %mm1         /* rg gb */
>        movq %mm0, %mm2         /* rg gb */
>        psrlw $5, %mm1          /* 0r rg */
>        psrlw $11, %mm0         /* 00 0r */
>        psllw $11, %mm2         /* b0 00 */
>        psllw $10, %mm1         /* g0 00 */
>        psllw $8, %mm0          /* 0r 00 */
>        psrlw $2, %mm1          /* 0g 00 */
>        psrlw $3, %mm2          /* 0b 00 */
>
>        pmulhw %mm5, %mm0       /* 00 0r */
>        pmulhw %mm6, %mm1       /* 00 0g */
>        pmulhw %mm7, %mm2       /* 00 0b */
>
>        psllw $11, %mm0         /* r0 00 */
>        psllw $5, %mm1          /* 0g g0 */
>        por %mm2, %mm0          /* r0 0b */
>        por %mm1, %mm0          /* rg gb */
>
>        movd %mm0, %eax
>        movw %ax, (%esi, %ecx, 2)
>
>        incl %ecx
>4:
>        cmpl $2, %ecx           /* indices up to 2 are still inside the row (index 3 is one past the end) */
>        jng 3b
>
>        addl bpl, %esi          /* move the row pointer down one line */
>        decl %edx               /* one row done */
>        jnz 1b
>5:
>        LEAVE
>
>
>shade_ximage_16_mmx_saturate_64:        /* saturating 16-bit path; has no ENTER of its own but ends in LEAVE -- presumably reached by a jump from shade_ximage_16_mmx_64 with esi/ebx/edx and mm5-mm7 already set up; confirm */
>
>        pcmpeqw %mm3, %mm3      /* all ones */
>        movq %mm3, %mm4         /* second copy, used for the green channel's saturation constant */
>        psllw $5, %mm3          /* ff e0 ff e0 ff e0 ff e0 */
>        psllw $6, %mm4          /* ff c0 ff c0 ff c0 ff c0 */
>
>1:      movl %ebx, %ecx         /* start of a row: ecx = ebx */
>        addl $3, %ecx
>        jns 3f                  /* non-negative after the +3 bias: use the one-pixel loop only */
>2:                              /* four pixels (one 64-bit load/store) per iteration */
>        movq (%esi, %ecx, 2), %mm0
>
>        movq %mm0, %mm1         /* rg gb */
>        movq %mm0, %mm2         /* rg gb */
>        psrlw $5, %mm1          /* 0r rg */
>        psrlw $11, %mm0         /* 00 0r */
>        psllw $11, %mm2         /* b0 00 */
>        psllw $10, %mm1         /* g0 00 */
>        psllw $8, %mm0          /* 0r 00 */
>        psrlw $2, %mm1          /* 0g 00 */
>        psrlw $3, %mm2          /* 0b 00 */
>
>        pmulhw %mm5, %mm0       /* xx xr */
>        pmulhw %mm6, %mm1       /* xx xg */
>        pmulhw %mm7, %mm2       /* xx xb */
>
>        /* Saturate upper */
>        paddusw %mm3, %mm0      /* ff er */
>        paddusw %mm4, %mm1      /* ff cg */
>        paddusw %mm3, %mm2      /* ff eb */
>
>        psubw %mm4, %mm1        /* 00 0g */
>        psubw %mm3, %mm2        /* 00 0b */
>        
>        psllw $11, %mm0         /* r0 00 */
>        psllw $5, %mm1          /* 0g g0 */
>        por %mm2, %mm0          /* r0 0b */
>        por %mm1, %mm0          /* rg gb */
>
>        movq %mm0, (%esi, %ecx, 2)
>
>        addl $4, %ecx           /* advance four pixels */
>        js 2b
>        jmp 4f                  /* go check whether single pixels remain */
>3:                              /* one pixel (one 16-bit load/store) per iteration */
>        movw (%esi, %ecx, 2), %ax
>        movd %eax, %mm0
>
>        movq %mm0, %mm1         /* rg gb */
>        movq %mm0, %mm2         /* rg gb */
>        psrlw $5, %mm1          /* 0r rg */
>        psrlw $11, %mm0         /* 00 0r */
>        psllw $11, %mm2         /* b0 00 */
>        psllw $10, %mm1         /* g0 00 */
>        psllw $8, %mm0          /* 0r 00 */
>        psrlw $2, %mm1          /* 0g 00 */
>        psrlw $3, %mm2          /* 0b 00 */
>
>        pmulhw %mm5, %mm0       /* xx xr */
>        pmulhw %mm6, %mm1       /* xx xg */
>        pmulhw %mm7, %mm2       /* xx xb */
>
>        /* Saturate upper */
>        paddusw %mm3, %mm0      /* ff er */
>        paddusw %mm4, %mm1      /* ff cg */
>        paddusw %mm3, %mm2      /* ff eb */
>
>        psubw %mm4, %mm1        /* 00 0g */
>        psubw %mm3, %mm2        /* 00 0b */
>        
>        psllw $11, %mm0         /* r0 00 */
>        psllw $5, %mm1          /* 0g g0 */
>        por %mm2, %mm0          /* r0 0b */
>        por %mm1, %mm0          /* rg gb */
>
>        movd %mm0, %eax
>        movw %ax, (%esi, %ecx, 2)
>
>        incl %ecx
>4:
>        cmpl $2, %ecx           /* keep taking single pixels while ecx <= 2 */
>        jng 3b
>
>        addl bpl, %esi          /* move the row pointer down one line */
>        decl %edx               /* one row done */
>        jnz 1b
>5:
>        LEAVE
>#endif  /* The commented out section  */
>
>
>/*
> * void shade_ximage_32_mmx_64(void *data, int bpl, int w, int h,
> *                             int rm, int gm, int bm);
> *
> * Scales the colour channels of every 32-bit pixel of a w x h image in
> * place.  For each pixel the three low bytes are multiplied by bm, gm and
> * rm respectively (byte 0 * bm, byte 1 * gm, byte 2 * rm), divided by 256
> * and clamped to 0..255, so a multiplier of 256 leaves a channel unchanged.
> * The fourth byte is multiplied by zero and is therefore stored as 0.
> *
> * ABI:    System V AMD64.  The first six integer arguments arrive in
> *         registers and only the seventh is on the stack:
> *           rdi = data   esi = bpl   edx = w   ecx = h
> *           r8d = rm     r9d = gm    8(%rsp) = bm   (at function entry)
> *         The int arguments are only 32 bits wide; the upper halves of
> *         their registers/stack slot are undefined, so they are
> *         sign-extended (sizes, stride) or loaded with movd (multipliers)
> *         and never read as 64-bit quantities.
> * Out:    nothing (the image is modified in place).
> * Clobb:  rax, rcx, rdx, rsi, rdi, mm0, mm1, mm4, mm5, mm6, flags.
> *         No callee-saved register is touched and no stack is used, so
> *         this leaf routine needs neither a frame nor ENTER/LEAVE.
> * Notes:  Returns without touching memory when w <= 0 or h <= 0.
> *         The multipliers are fed to a signed 16-bit multiply; they are
> *         assumed to be small non-negative values (not checked here).
> *         MMX registers are 64 bits wide: one pixel (four 16-bit lanes)
> *         is processed per inner-loop iteration.
> */
>shade_ximage_32_mmx_64:
>        movslq %edx, %rdx               /* rdx = (long) w                                   */
>        movslq %ecx, %rcx               /* rcx = (long) h                                   */
>        movslq %esi, %rsi               /* rsi = (long) bpl, the byte stride between rows   */
>
>        testq %rdx, %rdx
>        jle 3f                          /* empty or negative width: nothing to shade        */
>        testq %rcx, %rcx
>        jle 3f                          /* no rows: would otherwise wrap the row counter    */
>
>        leaq (%rdi, %rdx, 4), %rdi      /* rdi = one past the last pixel of the first row   */
>        negq %rdx                       /* rdx = -w, so (rdi, rdx, 4) is the row's 1st pixel */
>
>        /* Build mm4 = four 16-bit lanes { 0, rm, gm, bm } (high lane first). */
>        movd %r8d, %mm4                 /* 32-bit loads: bits 32..63 of each register are 0 */
>        movd %r9d, %mm5
>        movd 8(%rsp), %mm6              /* seventh argument, first stack slot               */
>        psllq $32, %mm4                 /* red multiplier   -> lane 2                       */
>        psllq $16, %mm5                 /* green multiplier -> lane 1                       */
>        por %mm6, %mm4                  /* blue multiplier stays in lane 0                  */
>        por %mm5, %mm4
>
>        /* mm6 = 0x8000 in every lane; mm5 = correction term, about -(m / 2) per lane. */
>        pcmpeqw %mm6, %mm6              /* all ones                                         */
>        psllw $15, %mm6                 /* 80 00 80 00 80 00 80 00                          */
>        movq %mm6, %mm5
>        pmulhw %mm4, %mm5               /* mm5 = (-32768 * m) >> 16 for each lane           */
>
>1:                                      /* ---- one row ----                                */
>        movq %rdx, %rax                 /* rax = -w: negative pixel index, counts up to 0   */
>2:                                      /* ---- one pixel ----                              */
>        movd (%rdi, %rax, 4), %mm1      /* 00 rr gg bb                                      */
>        pxor %mm0, %mm0
>        punpcklbw %mm1, %mm0            /* 00 00 rr 00 gg 00 bb 00 : each lane = c << 8     */
>        pxor %mm6, %mm0                 /* flip the top bit: lane = (c << 8) - 32768, signed */
>
>        pmulhw %mm4, %mm0               /* lane = c*m/256 - m/2 (signed high half)          */
>        psubw %mm5, %mm0                /* remove the -m/2 bias: lane = c*m/256             */
>        packuswb %mm0, %mm0             /* clamp each lane to 0..255 and pack to bytes      */
>
>        movd %mm0, (%rdi, %rax, 4)      /* store the shaded pixel back in place             */
>
>        incq %rax
>        jnz 2b                          /* index reaches 0 at the end of the row            */
>
>        addq %rsi, %rdi                 /* advance the end-of-row pointer by one stride     */
>        decq %rcx
>        jnz 1b                          /* more rows left                                   */
>3:
>        emms                            /* leave MMX state clean for x87 code in the caller */
>        ret
>        .size shade_ximage_32_mmx_64, .-shade_ximage_32_mmx_64
>
>#endif /* HAVE_MMX_64 */
>
>
>#ifdef MY_CONCERNS
>/*
> * Concerns:
> *	The AMD books say that some of the ops are SSE2 but they are in a MMX set of routines.
> *		Verify that MMX = 64 bit extensions and SSE2 = 128 bit extensions. (TRUE)
> *		The book uses 'xmm1' SSE2 and 128 bit so maybe 'xm1' is MMX/64 bit (FALSE)
> *	The movd instructions for the pixel array work on 32 bits or a double word.
> *		Investigate the possibility of handling 2 pixels (64 bits) at a time on AMD64 with movq
> *	Do we need to use WORDS_BIGENDIAN to avoid the bad blue acid trip fixed in 0.9.3-r1?
> *
> */
>#endif 
> 
>
>

Actions: View | Diff

Attachments on bug 84520: 52947 | 52948 | 57157