/*
 * Copyright (C) 1997-2004, Michael Jennings
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies of the Software, its documentation and marketing & publicity
 * materials, and acknowledgment shall be given in the documentation, materials
 * and software packages that this Software was used.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "config.h"

/* MMX routines for tinting XImages on AMD64 updated by Tres */
/* MMX routines for tinting XImages written by Willem Monsuwe */

/*
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 * ABI:    System V AMD64 (ELF).
 *
 * C prototypes:
 *   shade_ximage_xx_mmx_64(void *data, int bpl, int w, int h,
 *                          int rm, int gm, int bm);
 *
 * Under the System V AMD64 ABI the first six integer arguments arrive in
 * registers and only the seventh is on the stack:
 *
 *   data -> %rdi    bpl -> %esi    w  -> %edx    h -> %ecx
 *   rm   -> %r8d    gm  -> %r9d    bm -> 8(%rsp) at entry
 *
 * "int" arguments only define the low 32 bits of their register/stack slot,
 * so they are sign-extended before being used in 64-bit arithmetic and are
 * loaded into MMX registers with movd (32 bits), never movq.
 */

/* Manuals used in this port:
 * The Gnu Assembler
 *   http://www.gnu.org/software/binutils/manual/gas-2.9.1/html_mono/as.html
 * AMD64 Architecture Programmer's Manual Volume 1: Application Programming
 *   http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24592.pdf
 * AMD64 Architecture Programmer's Manual Volume 2: System Programming
 *   http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
 * AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions
 *   http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24594.pdf
 * AMD64 Architecture Programmer's Manual Volume 4: 128-Bit Media Instructions
 *   http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26568.pdf
 * AMD64 Architecture Programmer's Manual Volume 5: 64-Bit Media and x87 Floating-Point Instructions
 *   http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26569.pdf
 */

#ifdef HAVE_MMX_64

/* Incoming argument locations (valid only until ENTER has run). */
#define arg_data %rdi
#define arg_bpl  %esi
#define arg_w    %edx
#define arg_h    %ecx

/* Operand names usable in function bodies after ENTER. */
#define bpl %r10                /* bytes per line, sign-extended to 64 bits */
#define rm  %r8d                /* red multiplier   (32-bit int)            */
#define gm  %r9d                /* green multiplier (32-bit int)            */
#define bm  16(%rbp)            /* blue multiplier  (7th arg, on the stack) */

/*
.global shade_ximage_15_mmx_64
        .type shade_ximage_15_mmx_64,@function
.global shade_ximage_16_mmx_64
        .type shade_ximage_16_mmx_64,@function
*/
.global shade_ximage_32_mmx_64
        .type shade_ximage_32_mmx_64,@function

.bss
.text
.align 8

/*
 * ENTER: set up a frame and move the arguments into working registers.
 *   After ENTER:  %rsi = data, %rbx = w, %rdx = h, bpl (%r10) = bytes/line,
 *                 %rcx is free for use as the inner loop counter.
 * Only %rbp and %rbx are callee-saved in the System V ABI, so only those are
 * saved.  The moves are ordered so each incoming register is read before it
 * is overwritten (%esi before %rsi, %edx before %rdx).
 */
#define ENTER \
        pushq   %rbp                    ;\
        movq    %rsp, %rbp              ;\
        pushq   %rbx                    ;\
        movslq  arg_bpl, bpl            ;\
        movq    arg_data, %rsi          ;\
        movslq  arg_w, %rbx             ;\
        movslq  arg_h, %rdx

/*
 * LEAVE: leave MMX state (emms), restore the saved registers and return.
 * The function body must not leave anything extra on the stack.
 */
#define LEAVE \
4:                                      ;\
        emms                            ;\
        popq    %rbx                    ;\
        popq    %rbp                    ;\
        ret

#if 0
/* This comments out everything from here down to 'shade_ximage_32_mmx_64()' */
/* The AMD64 port covers the macros above and the last function only.  The
 * 15 and 16 bit routines below are still the unported 32-bit bodies (32-bit
 * registers and addressing, old label names) and must be converted before
 * this section can be enabled. */

shade_ximage_15_mmx_64:
        ENTER

        leal -6(%esi, %ebx, 2), %esi
        negl %ebx
        jz 5f

        /* Setup multipliers */
        movd rm, %mm5
        movd gm, %mm6
        movd bm, %mm7
        punpcklwd %mm5, %mm5    /* 00 00 00 00 rm rm rm rm */
        punpcklwd %mm6, %mm6    /* 00 00 00 00 gm gm gm gm */
        punpcklwd %mm7, %mm7    /* 00 00 00 00 bm bm bm bm */
        punpckldq %mm5, %mm5    /* rm rm rm rm rm rm rm rm */
        punpckldq %mm6, %mm6    /* gm gm gm gm gm gm gm gm */
        punpckldq %mm7, %mm7    /* bm bm bm bm bm bm bm bm */

        cmpl $256, rm
        jg shade_ximage_15_mmx_saturate
        cmpl $256, gm
        jg shade_ximage_15_mmx_saturate
        cmpl $256, bm
        jg shade_ximage_15_mmx_saturate

1:      movl %ebx, %ecx
        addl $3, %ecx
        jns 3f
2:
        movq (%esi, %ecx, 2), %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $10, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $11, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $3, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* 00 0r */
        pmulhw %mm6, %mm1       /* 00 0g */
        pmulhw %mm7, %mm2       /* 00 0b */

        psllw $10, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js 2b
        jmp 4f
3:
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $10, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $11, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $3, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* 00 0r */
        pmulhw %mm6, %mm1       /* 00 0g */
        pmulhw %mm7, %mm2       /* 00 0b */

        psllw $10, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
4:
        cmpl $2, %ecx
        jng 3b

        addl bpl, %esi
        decl %edx
        jnz 1b
5:
        LEAVE


shade_ximage_15_mmx_saturate_64:

        pcmpeqw %mm3, %mm3
        psllw $5, %mm3          /* ff e0 ff e0 ff e0 ff e0 */

1:      movl %ebx, %ecx
        addl $3, %ecx
        jns 3f
2:
        movq (%esi, %ecx, 2), %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $10, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $11, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $3, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* xx xr */
        pmulhw %mm6, %mm1       /* xx xg */
        pmulhw %mm7, %mm2       /* xx xb */

        /* Saturate upper */
        paddusw %mm3, %mm0      /* ff er */
        paddusw %mm3, %mm1      /* ff eg */
        paddusw %mm3, %mm2      /* ff eb */

        psubw %mm3, %mm1        /* 00 0g */
        psubw %mm3, %mm2        /* 00 0b */

        psllw $10, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js 2b
        jmp 4f
3:
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $10, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $11, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $3, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* xx xr */
        pmulhw %mm6, %mm1       /* xx xg */
        pmulhw %mm7, %mm2       /* xx xb */

        /* Saturate upper */
        paddusw %mm3, %mm0      /* ff er */
        paddusw %mm3, %mm1      /* ff eg */
        paddusw %mm3, %mm2      /* ff eb */

        psubw %mm3, %mm1        /* 00 0g */
        psubw %mm3, %mm2        /* 00 0b */

        psllw $10, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
4:
        cmpl $2, %ecx
        jng 3b

        addl bpl, %esi
        decl %edx
        jnz 1b
5:
        LEAVE


shade_ximage_16_mmx_64:
        ENTER

        leal -6(%esi, %ebx, 2), %esi
        negl %ebx
        jz 5f

        /* Setup multipliers */
        movd rm, %mm5
        movd gm, %mm6
        movd bm, %mm7
        punpcklwd %mm5, %mm5    /* 00 00 00 00 rm rm rm rm */
        punpcklwd %mm6, %mm6    /* 00 00 00 00 gm gm gm gm */
        punpcklwd %mm7, %mm7    /* 00 00 00 00 bm bm bm bm */
        punpckldq %mm5, %mm5    /* rm rm rm rm rm rm rm rm */
        punpckldq %mm6, %mm6    /* gm gm gm gm gm gm gm gm */
        punpckldq %mm7, %mm7    /* bm bm bm bm bm bm bm bm */

        cmpl $256, rm
        jg shade_ximage_16_mmx_saturate
        cmpl $256, gm
        jg shade_ximage_16_mmx_saturate
        cmpl $256, bm
        jg shade_ximage_16_mmx_saturate

1:      movl %ebx, %ecx
        addl $3, %ecx
        jns 3f
2:
        movq (%esi, %ecx, 2), %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $11, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $10, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $2, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* 00 0r */
        pmulhw %mm6, %mm1       /* 00 0g */
        pmulhw %mm7, %mm2       /* 00 0b */

        psllw $11, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js 2b
        jmp 4f
3:
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $11, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $10, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $2, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* 00 0r */
        pmulhw %mm6, %mm1       /* 00 0g */
        pmulhw %mm7, %mm2       /* 00 0b */

        psllw $11, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
4:
        cmpl $2, %ecx
        jng 3b

        addl bpl, %esi
        decl %edx
        jnz 1b
5:
        LEAVE


shade_ximage_16_mmx_saturate_64:

        pcmpeqw %mm3, %mm3
        movq %mm3, %mm4
        psllw $5, %mm3          /* ff e0 ff e0 ff e0 ff e0 */
        psllw $6, %mm4          /* ff c0 ff c0 ff c0 ff c0 */

1:      movl %ebx, %ecx
        addl $3, %ecx
        jns 3f
2:
        movq (%esi, %ecx, 2), %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $11, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $10, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $2, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* xx xr */
        pmulhw %mm6, %mm1       /* xx xg */
        pmulhw %mm7, %mm2       /* xx xb */

        /* Saturate upper */
        paddusw %mm3, %mm0      /* ff er */
        paddusw %mm4, %mm1      /* ff cg */
        paddusw %mm3, %mm2      /* ff eb */

        psubw %mm4, %mm1        /* 00 0g */
        psubw %mm3, %mm2        /* 00 0b */

        psllw $11, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js 2b
        jmp 4f
3:
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $11, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $10, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $2, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* xx xr */
        pmulhw %mm6, %mm1       /* xx xg */
        pmulhw %mm7, %mm2       /* xx xb */

        /* Saturate upper */
        paddusw %mm3, %mm0      /* ff er */
        paddusw %mm4, %mm1      /* ff cg */
        paddusw %mm3, %mm2      /* ff eb */

        psubw %mm4, %mm1        /* 00 0g */
        psubw %mm3, %mm2        /* 00 0b */

        psllw $11, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
4:
        cmpl $2, %ecx
        jng 3b

        addl bpl, %esi
        decl %edx
        jnz 1b
5:
        LEAVE

#endif /* The commented out section */

/*-----------------------------------------------------------------------
 * void shade_ximage_32_mmx_64(void *data, int bpl, int w, int h,
 *                             int rm, int gm, int bm);
 *
 * Tints a 32 bits-per-pixel image in place.  Every pixel is read as a
 * little-endian 32-bit word 00 rr gg bb; each colour byte c is replaced by
 * saturate_u8((c * multiplier) >> 8) and the top byte is written as 0.
 * A multiplier of 256 therefore leaves the channel unchanged.
 *
 * ABI:    System V AMD64
 * In:     see the argument table at the top of the file
 * Regs:   %rsi = one past the end of the current row
 *         %rbx = -w (negative pixel index of the row start)
 *         %rcx = negative pixel index, counts up to 0
 *         %rdx = rows remaining
 *         %mm4 = 00 00 : rm : gm : bm   (four 16-bit multipliers)
 *         %mm5 = per-channel correction term, see below
 *         %mm6 = 80 00 x 4              (sign-flip mask)
 * Clobb:  %rcx, %rdx, %rsi, %r10, %mm0-%mm6 (x87 state reset by emms), flags
 * Notes:  Returns without touching memory when w <= 0 or h <= 0.
 *         pmulhw is a signed 16x16 multiply, so the multipliers are treated
 *         as signed 16-bit quantities; values outside that range are not
 *         handled here.
 *-----------------------------------------------------------------------*/
shade_ximage_32_mmx_64:
        ENTER                           /* rsi = data, rbx = w, rdx = h */

        testq   %rdx, %rdx
        jle     3f                      /* h <= 0: no rows to process */

        leaq    (%rsi, %rbx, 4), %rsi   /* rsi = data + w * 4 (end of first row) */
        negq    %rbx                    /* rbx = -w, so (rsi + rbx*4) is the row start */
        jns     3f                      /* w <= 0: nothing to do */

        /* Pack the three multipliers into one register, one per 16-bit lane.
         * movd loads exactly the 32 bits of each int; the upper half of the
         * MMX register is cleared. */
        movd    rm, %mm4
        movd    gm, %mm5
        movd    bm, %mm6
        psllq   $32, %mm4               /* 00 00 : rm : 00 00 : 00 00 */
        psllq   $16, %mm5               /* 00 00 : 00 00 : gm : 00 00 */
        por     %mm6, %mm4
        por     %mm5, %mm4              /* 00 00 : rm : gm : bm */

        pcmpeqw %mm6, %mm6              /* all ones */
        psllw   $15, %mm6               /* 80 00 80 00 80 00 80 00 */

        /* Correction term: (0x8000 as signed = -32768) * m >> 16 = -(m / 2).
         * Subtracting it later adds back the m/2 lost by the sign flip. */
        movq    %mm6, %mm5
        pmulhw  %mm4, %mm5

1:                                      /* ---- per row ---- */
        movq    %rbx, %rcx              /* rcx = -w */
2:                                      /* ---- per pixel ---- */
        movd    (%rsi, %rcx, 4), %mm1   /* 00 rr gg bb (one pixel only) */
        pxor    %mm0, %mm0
        punpcklbw %mm1, %mm0            /* 00 00 rr 00 gg 00 bb 00: each byte << 8 */
        pxor    %mm6, %mm0              /* flip sign bit: c*256 - 32768, fits signed 16 */
        pmulhw  %mm4, %mm0              /* (c*256 - 32768) * m >> 16 = c*m/256 - m/2 */
        psubw   %mm5, %mm0              /* add m/2 back: c*m/256 */
        packuswb %mm0, %mm0             /* clamp each word to 0..255: 00 rr gg bb */
        movd    %mm0, (%rsi, %rcx, 4)   /* store the tinted pixel */

        incq    %rcx
        jnz     2b                      /* more pixels in this row */

        addq    bpl, %rsi               /* advance to the end of the next row */
        decq    %rdx
        jnz     1b                      /* more rows */
3:
        LEAVE
        .size shade_ximage_32_mmx_64, .-shade_ximage_32_mmx_64

#endif /* HAVE_MMX_64 */

#ifdef MY_CONCERNS
/*
 * Concerns:
 * The AMD books say that some of the ops are SSE2 but they are in a MMX set of routines.
 *   Verify that MMX = 64 bit extensions and SSE2 = 128 bit extensions. (TRUE)
 *   The book uses 'xmm1' SSE2 and 128 bit so maybe 'xm1' is MMX/64 bit (FALSE)
 * The movd instructions for the pixel array work on 32 bits or a double word.
 *   Investigate the possibility of handling 2 pixels (64 bits) at a time on AMD64 with movq
 * Do we need to use WORDS_BIGENDIAN to avoid the bad blue acid trip fixed in 0.9.3-r1?
 *
 */
#endif