Go to:
Gentoo Home
Documentation
Forums
Lists
Bugs
Planet
Store
Wiki
Get Gentoo!
Gentoo's Bugzilla – Attachment 57157 Details for
Bug 84520
x11-terms/eterm-0.9.3 doesn't honor SIMD extensions on amd64
Home
|
New
–
[Ex]
|
Browse
|
Search
|
Privacy Policy
|
[?]
|
Reports
|
Requests
|
Help
|
New Account
|
Log In
[x]
|
Forgot Password
Login:
[x]
[patch]
My first draft of an AMD64/MMX/SSE2 port of Eterm's shading functions.
mmx_64_cmod.S (text/plain), 22.57 KB, created by
Tres 'RiverRat' Melton
on 2005-04-25 03:20:35 UTC
(
hide
)
Description:
My first draft of an AMD64/MMX/SSE2 port of Eterm's shading functions.
Filename:
MIME Type:
Creator:
Tres 'RiverRat' Melton
Created:
2005-04-25 03:20:35 UTC
Size:
22.57 KB
patch
obsolete
/*
 * Copyright (C) 1997-2004, Michael Jennings
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies of the Software, its documentation and marketing & publicity
 * materials, and acknowledgment shall be given in the documentation, materials
 * and software packages that this Software was used.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "config.h"

/* MMX routines for tinting XImages on AMD64 updated by Tres <tres@mindspring.com> */
/* MMX routines for tinting XImages written by Willem Monsuwe <willem@stack.nl> */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), run through cpp.
 * Target: x86-64, System V AMD64 ABI.
 *
 * C prototypes:
 *   shade_ximage_xx_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 *   shade_ximage_xx_mmx_64(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 *
 * Under the System V AMD64 ABI these arguments arrive as:
 *   data -> %rdi    bpl -> %esi    w  -> %edx    h -> %ecx
 *   rm   -> %r8d    gm  -> %r9d    bm -> first stack slot (8(%rsp) at entry)
 * The upper 32 bits of a register or stack slot holding an int are
 * unspecified, so every int is read as 32 bits and widened explicitly.
 */

/* Manuals used in this port:
 * The Gnu Assembler
 * http://www.gnu.org/software/binutils/manual/gas-2.9.1/html_mono/as.html
 * AMD64 Architecture Programmer's Manual Volume 1: Application Programming
 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24592.pdf
 * AMD64 Architecture Programmer's Manual Volume 2: System Programming
 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
 * AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions
 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24594.pdf
 * AMD64 Architecture Programmer's Manual Volume 4: 128-Bit Media Instructions
 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26568.pdf
 * AMD64 Architecture Programmer's Manual Volume 5: 64-Bit Media and x87 Floating-Point Instructions
 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26569.pdf
 */

#ifdef HAVE_MMX_64

/*
.global shade_ximage_15_mmx_64
	.type shade_ximage_15_mmx_64,@function
.global shade_ximage_16_mmx_64
	.type shade_ximage_16_mmx_64,@function
*/
.global shade_ximage_32_mmx_64
	.type shade_ximage_32_mmx_64,@function

.text
.p2align 4

#if 0	/* This comments out everything from here down to 'shade_ximage_32_mmx_64()' */
/*
 * NOTE(review): everything in this disabled region is still the i386 code
 * and is NOT yet a working AMD64 port:
 *   - the operand macros below and ENTER assume all seven arguments live on
 *     the stack (i386 cdecl); on AMD64 only bm does, see the table above;
 *   - pointers are handled through 32-bit registers (%esi), which truncates
 *     addresses in 64-bit mode;
 *   - the "jg shade_ximage_1x_mmx_saturate" targets lack the _64 suffix that
 *     the labels in this file carry.
 * The instructions are kept exactly as submitted so the port can continue
 * from them.
 */

#define data	16(%rbp)
#define bpl	24(%rbp)
#define w	32(%rbp)
#define h	40(%rbp)
#define rm	48(%rbp)
#define gm	56(%rbp)
#define bm	64(%rbp)

#define ENTER \
	pushq %rbp ;\
	movq %rsp, %rbp ;\
	pushq %rbx ;\
	pushq %rcx ;\
	pushq %rdx ;\
	pushq %rdi ;\
	pushq %rsi ;\
	movq data, %rsi ;\
	movq w, %rbx ;\
	movq h, %rdx

#define LEAVE \
4: ;\
	emms ;\
	popq %rsi ;\
	popq %rdi ;\
	popq %rdx ;\
	popq %rcx ;\
	popq %rbx ;\
	movq %rbp, %rsp ;\
	popq %rbp ;\
	ret

shade_ximage_15_mmx_64:
	ENTER

	leal -6(%esi, %ebx, 2), %esi
	negl %ebx
	jz 5f

	/* Setup multipliers */
	movd rm, %mm5
	movd gm, %mm6
	movd bm, %mm7
	punpcklwd %mm5, %mm5	/* 00 00 00 00 rm rm rm rm */
	punpcklwd %mm6, %mm6	/* 00 00 00 00 gm gm gm gm */
	punpcklwd %mm7, %mm7	/* 00 00 00 00 bm bm bm bm */
	punpckldq %mm5, %mm5	/* rm rm rm rm rm rm rm rm */
	punpckldq %mm6, %mm6	/* gm gm gm gm gm gm gm gm */
	punpckldq %mm7, %mm7	/* bm bm bm bm bm bm bm bm */

	cmpl $256, rm
	jg shade_ximage_15_mmx_saturate
	cmpl $256, gm
	jg shade_ximage_15_mmx_saturate
	cmpl $256, bm
	jg shade_ximage_15_mmx_saturate

1:	movl %ebx, %ecx
	addl $3, %ecx
	jns 3f
2:
	movq (%esi, %ecx, 2), %mm0

	movq %mm0, %mm1		/* rg gb */
	movq %mm0, %mm2		/* rg gb */
	psrlw $5, %mm1		/* 0r rg */
	psrlw $10, %mm0		/* 00 0r */
	psllw $11, %mm2		/* b0 00 */
	psllw $11, %mm1		/* g0 00 */
	psllw $8, %mm0		/* 0r 00 */
	psrlw $3, %mm1		/* 0g 00 */
	psrlw $3, %mm2		/* 0b 00 */

	pmulhw %mm5, %mm0	/* 00 0r */
	pmulhw %mm6, %mm1	/* 00 0g */
	pmulhw %mm7, %mm2	/* 00 0b */

	psllw $10, %mm0		/* r0 00 */
	psllw $5, %mm1		/* 0g g0 */
	por %mm2, %mm0		/* r0 0b */
	por %mm1, %mm0		/* rg gb */

	movq %mm0, (%esi, %ecx, 2)

	addl $4, %ecx
	js 2b
	jmp 4f
3:
	movw (%esi, %ecx, 2), %ax
	movd %eax, %mm0

	movq %mm0, %mm1		/* rg gb */
	movq %mm0, %mm2		/* rg gb */
	psrlw $5, %mm1		/* 0r rg */
	psrlw $10, %mm0		/* 00 0r */
	psllw $11, %mm2		/* b0 00 */
	psllw $11, %mm1		/* g0 00 */
	psllw $8, %mm0		/* 0r 00 */
	psrlw $3, %mm1		/* 0g 00 */
	psrlw $3, %mm2		/* 0b 00 */

	pmulhw %mm5, %mm0	/* 00 0r */
	pmulhw %mm6, %mm1	/* 00 0g */
	pmulhw %mm7, %mm2	/* 00 0b */

	psllw $10, %mm0		/* r0 00 */
	psllw $5, %mm1		/* 0g g0 */
	por %mm2, %mm0		/* r0 0b */
	por %mm1, %mm0		/* rg gb */

	movd %mm0, %eax
	movw %ax, (%esi, %ecx, 2)

	incl %ecx
4:
	cmpl $2, %ecx
	jng 3b

	addl bpl, %esi
	decl %edx
	jnz 1b
5:
	LEAVE


shade_ximage_15_mmx_saturate_64:

	pcmpeqw %mm3, %mm3
	psllw $5, %mm3		/* ff e0 ff e0 ff e0 ff e0 */

1:	movl %ebx, %ecx
	addl $3, %ecx
	jns 3f
2:
	movq (%esi, %ecx, 2), %mm0

	movq %mm0, %mm1		/* rg gb */
	movq %mm0, %mm2		/* rg gb */
	psrlw $5, %mm1		/* 0r rg */
	psrlw $10, %mm0		/* 00 0r */
	psllw $11, %mm2		/* b0 00 */
	psllw $11, %mm1		/* g0 00 */
	psllw $8, %mm0		/* 0r 00 */
	psrlw $3, %mm1		/* 0g 00 */
	psrlw $3, %mm2		/* 0b 00 */

	pmulhw %mm5, %mm0	/* xx xr */
	pmulhw %mm6, %mm1	/* xx xg */
	pmulhw %mm7, %mm2	/* xx xb */

	/* Saturate upper */
	paddusw %mm3, %mm0	/* ff er */
	paddusw %mm3, %mm1	/* ff eg */
	paddusw %mm3, %mm2	/* ff eb */

	psubw %mm3, %mm1	/* 00 0g */
	psubw %mm3, %mm2	/* 00 0b */

	psllw $10, %mm0		/* r0 00 */
	psllw $5, %mm1		/* 0g g0 */
	por %mm2, %mm0		/* r0 0b */
	por %mm1, %mm0		/* rg gb */

	movq %mm0, (%esi, %ecx, 2)

	addl $4, %ecx
	js 2b
	jmp 4f
3:
	movw (%esi, %ecx, 2), %ax
	movd %eax, %mm0

	movq %mm0, %mm1		/* rg gb */
	movq %mm0, %mm2		/* rg gb */
	psrlw $5, %mm1		/* 0r rg */
	psrlw $10, %mm0		/* 00 0r */
	psllw $11, %mm2		/* b0 00 */
	psllw $11, %mm1		/* g0 00 */
	psllw $8, %mm0		/* 0r 00 */
	psrlw $3, %mm1		/* 0g 00 */
	psrlw $3, %mm2		/* 0b 00 */

	pmulhw %mm5, %mm0	/* xx xr */
	pmulhw %mm6, %mm1	/* xx xg */
	pmulhw %mm7, %mm2	/* xx xb */

	/* Saturate upper */
	paddusw %mm3, %mm0	/* ff er */
	paddusw %mm3, %mm1	/* ff eg */
	paddusw %mm3, %mm2	/* ff eb */

	psubw %mm3, %mm1	/* 00 0g */
	psubw %mm3, %mm2	/* 00 0b */

	psllw $10, %mm0		/* r0 00 */
	psllw $5, %mm1		/* 0g g0 */
	por %mm2, %mm0		/* r0 0b */
	por %mm1, %mm0		/* rg gb */

	movd %mm0, %eax
	movw %ax, (%esi, %ecx, 2)

	incl %ecx
4:
	cmpl $2, %ecx
	jng 3b

	addl bpl, %esi
	decl %edx
	jnz 1b
5:
	LEAVE


shade_ximage_16_mmx_64:
	ENTER

	leal -6(%esi, %ebx, 2), %esi
	negl %ebx
	jz 5f

	/* Setup multipliers */
	movd rm, %mm5
	movd gm, %mm6
	movd bm, %mm7
	punpcklwd %mm5, %mm5	/* 00 00 00 00 rm rm rm rm */
	punpcklwd %mm6, %mm6	/* 00 00 00 00 gm gm gm gm */
	punpcklwd %mm7, %mm7	/* 00 00 00 00 bm bm bm bm */
	punpckldq %mm5, %mm5	/* rm rm rm rm rm rm rm rm */
	punpckldq %mm6, %mm6	/* gm gm gm gm gm gm gm gm */
	punpckldq %mm7, %mm7	/* bm bm bm bm bm bm bm bm */

	cmpl $256, rm
	jg shade_ximage_16_mmx_saturate
	cmpl $256, gm
	jg shade_ximage_16_mmx_saturate
	cmpl $256, bm
	jg shade_ximage_16_mmx_saturate

1:	movl %ebx, %ecx
	addl $3, %ecx
	jns 3f
2:
	movq (%esi, %ecx, 2), %mm0

	movq %mm0, %mm1		/* rg gb */
	movq %mm0, %mm2		/* rg gb */
	psrlw $5, %mm1		/* 0r rg */
	psrlw $11, %mm0		/* 00 0r */
	psllw $11, %mm2		/* b0 00 */
	psllw $10, %mm1		/* g0 00 */
	psllw $8, %mm0		/* 0r 00 */
	psrlw $2, %mm1		/* 0g 00 */
	psrlw $3, %mm2		/* 0b 00 */

	pmulhw %mm5, %mm0	/* 00 0r */
	pmulhw %mm6, %mm1	/* 00 0g */
	pmulhw %mm7, %mm2	/* 00 0b */

	psllw $11, %mm0		/* r0 00 */
	psllw $5, %mm1		/* 0g g0 */
	por %mm2, %mm0		/* r0 0b */
	por %mm1, %mm0		/* rg gb */

	movq %mm0, (%esi, %ecx, 2)

	addl $4, %ecx
	js 2b
	jmp 4f
3:
	movw (%esi, %ecx, 2), %ax
	movd %eax, %mm0

	movq %mm0, %mm1		/* rg gb */
	movq %mm0, %mm2		/* rg gb */
	psrlw $5, %mm1		/* 0r rg */
	psrlw $11, %mm0		/* 00 0r */
	psllw $11, %mm2		/* b0 00 */
	psllw $10, %mm1		/* g0 00 */
	psllw $8, %mm0		/* 0r 00 */
	psrlw $2, %mm1		/* 0g 00 */
	psrlw $3, %mm2		/* 0b 00 */

	pmulhw %mm5, %mm0	/* 00 0r */
	pmulhw %mm6, %mm1	/* 00 0g */
	pmulhw %mm7, %mm2	/* 00 0b */

	psllw $11, %mm0		/* r0 00 */
	psllw $5, %mm1		/* 0g g0 */
	por %mm2, %mm0		/* r0 0b */
	por %mm1, %mm0		/* rg gb */

	movd %mm0, %eax
	movw %ax, (%esi, %ecx, 2)

	incl %ecx
4:
	cmpl $2, %ecx
	jng 3b

	addl bpl, %esi
	decl %edx
	jnz 1b
5:
	LEAVE


shade_ximage_16_mmx_saturate_64:

	pcmpeqw %mm3, %mm3
	movq %mm3, %mm4
	psllw $5, %mm3		/* ff e0 ff e0 ff e0 ff e0 */
	psllw $6, %mm4		/* ff c0 ff c0 ff c0 ff c0 */

1:	movl %ebx, %ecx
	addl $3, %ecx
	jns 3f
2:
	movq (%esi, %ecx, 2), %mm0

	movq %mm0, %mm1		/* rg gb */
	movq %mm0, %mm2		/* rg gb */
	psrlw $5, %mm1		/* 0r rg */
	psrlw $11, %mm0		/* 00 0r */
	psllw $11, %mm2		/* b0 00 */
	psllw $10, %mm1		/* g0 00 */
	psllw $8, %mm0		/* 0r 00 */
	psrlw $2, %mm1		/* 0g 00 */
	psrlw $3, %mm2		/* 0b 00 */

	pmulhw %mm5, %mm0	/* xx xr */
	pmulhw %mm6, %mm1	/* xx xg */
	pmulhw %mm7, %mm2	/* xx xb */

	/* Saturate upper */
	paddusw %mm3, %mm0	/* ff er */
	paddusw %mm4, %mm1	/* ff cg */
	paddusw %mm3, %mm2	/* ff eb */

	psubw %mm4, %mm1	/* 00 0g */
	psubw %mm3, %mm2	/* 00 0b */

	psllw $11, %mm0		/* r0 00 */
	psllw $5, %mm1		/* 0g g0 */
	por %mm2, %mm0		/* r0 0b */
	por %mm1, %mm0		/* rg gb */

	movq %mm0, (%esi, %ecx, 2)

	addl $4, %ecx
	js 2b
	jmp 4f
3:
	movw (%esi, %ecx, 2), %ax
	movd %eax, %mm0

	movq %mm0, %mm1		/* rg gb */
	movq %mm0, %mm2		/* rg gb */
	psrlw $5, %mm1		/* 0r rg */
	psrlw $11, %mm0		/* 00 0r */
	psllw $11, %mm2		/* b0 00 */
	psllw $10, %mm1		/* g0 00 */
	psllw $8, %mm0		/* 0r 00 */
	psrlw $2, %mm1		/* 0g 00 */
	psrlw $3, %mm2		/* 0b 00 */

	pmulhw %mm5, %mm0	/* xx xr */
	pmulhw %mm6, %mm1	/* xx xg */
	pmulhw %mm7, %mm2	/* xx xb */

	/* Saturate upper */
	paddusw %mm3, %mm0	/* ff er */
	paddusw %mm4, %mm1	/* ff cg */
	paddusw %mm3, %mm2	/* ff eb */

	psubw %mm4, %mm1	/* 00 0g */
	psubw %mm3, %mm2	/* 00 0b */

	psllw $11, %mm0		/* r0 00 */
	psllw $5, %mm1		/* 0g g0 */
	por %mm2, %mm0		/* r0 0b */
	por %mm1, %mm0		/* rg gb */

	movd %mm0, %eax
	movw %ax, (%esi, %ecx, 2)

	incl %ecx
4:
	cmpl $2, %ecx
	jng 3b

	addl bpl, %esi
	decl %edx
	jnz 1b
5:
	LEAVE
#endif	/* The commented out section */


/*-----------------------------------------------------------------------
 * void shade_ximage_32_mmx_64(void *data, int bpl, int w, int h,
 *                             int rm, int gm, int bm);
 *
 * Tints a 32-bits-per-pixel image in place.  Every pixel is read as a
 * little-endian dword 0x00RRGGBB; each colour byte c becomes
 * min(255, (c * m) / 256) with m = rm, gm or bm for its channel, and the
 * top byte of every pixel is written back as 0.
 *
 * ABI:   System V AMD64, leaf function, no stack frame.
 * In:    %rdi = data   first pixel of the first row
 *        %esi = bpl    bytes from one row to the next
 *        %edx = w      pixels per row
 *        %ecx = h      number of rows
 *        %r8d = rm, %r9d = gm, 8(%rsp) = bm
 *               Multipliers, 256 = unchanged.  They are multiplied as
 *               signed 16-bit words (pmulhw), so only 0..32767 is useful.
 * Out:   nothing; returns immediately when w <= 0 or h <= 0.
 * Clobb: %rax, %rcx, %rdx, %rsi, %rdi, %mm0, %mm1, %mm4-%mm6, flags.
 *        emms is executed before returning from the shading path so the
 *        caller gets the x87 unit back in a usable state.
 *-----------------------------------------------------------------------*/
shade_ximage_32_mmx_64:
	testl	%edx, %edx		/* nothing to do for w <= 0 ... */
	jle	.Lshade32_ret
	testl	%ecx, %ecx		/* ... or for h <= 0 */
	jle	.Lshade32_ret

	movslq	%edx, %rdx		/* rdx = (long) w */
	movslq	%esi, %rsi		/* rsi = (long) bpl, may be negative */
	leaq	(%rdi, %rdx, 4), %rdi	/* rdi = one past the end of row 0 */
	negq	%rdx			/* rdx = -w, start index of every row */

	/* mm4 = 0000 : rm : gm : bm  (one 16-bit multiplier per colour word) */
	movd	%r8d, %mm4
	movd	%r9d, %mm5
	movd	8(%rsp), %mm6		/* bm is the only stack argument */
	psllq	$32, %mm4
	psllq	$16, %mm5
	por	%mm6, %mm4
	por	%mm5, %mm4

	/* mm6 = 8000 8000 8000 8000, the sign bit of every word */
	pcmpeqw	%mm6, %mm6
	psllw	$15, %mm6

	/*
	 * mm5 = (m * -32768) >> 16 = -(m / 2), rounded down, per word.
	 * Subtracting it below undoes the bias the sign flip introduces.
	 */
	movq	%mm6, %mm5
	pmulhw	%mm4, %mm5

.Lshade32_row:
	movq	%rdx, %rax		/* rax = -w, counts up to 0 */
.Lshade32_pixel:
	movd	(%rdi, %rax, 4), %mm1	/* 00 rr gg bb */
	pxor	%mm0, %mm0
	punpcklbw %mm1, %mm0		/* 00 00 rr 00 gg 00 bb 00: c << 8 per word */
	pxor	%mm6, %mm0		/* (c << 8) - 32768, so it fits a signed word */
	pmulhw	%mm4, %mm0		/* (c * m) / 256 - m / 2 */
	psubw	%mm5, %mm0		/* + m / 2  ->  (c * m) / 256 */
	packuswb %mm0, %mm0		/* clamp every word to 0..255 */
	movd	%mm0, (%rdi, %rax, 4)

	incq	%rax
	jnz	.Lshade32_pixel

	addq	%rsi, %rdi		/* end of the next row */
	decl	%ecx
	jnz	.Lshade32_row

	emms				/* leave MMX state before returning to C */
.Lshade32_ret:
	ret
	.size shade_ximage_32_mmx_64, .-shade_ximage_32_mmx_64

#endif /* HAVE_MMX_64 */


#ifdef MY_CONCERNS
/*
 * Concerns:
 *   The AMD books say that some of the ops are SSE2 but they are in an MMX set of routines.
 *     Verify that MMX = 64 bit extensions and SSE2 = 128 bit extensions. (TRUE)
 *     The book uses 'xmm1' for SSE2 and 128 bit, so maybe 'xm1' is MMX/64 bit. (FALSE)
 *   The movd instructions for the pixel array work on 32 bits, i.e. a double word.
 *     Investigate the possibility of handling 2 pixels (64 bits) at a time on AMD64 with movq.
 *   Do we need to use WORDS_BIGENDIAN to avoid the bad blue acid trip fixed in 0.9.3-r1?
 *
 */
#endif

/* This object needs no executable stack; say so, so the linker does not assume one. */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif
You cannot view the attachment while viewing its details because your browser does not support IFRAMEs.
View the attachment on a separate page
.
View Attachment As Diff
View Attachment As Raw
Actions:
View
|
Diff
Attachments on
bug 84520
:
52947
|
52948
| 57157