/*
 * Copyright (C) 1997-2004, Michael Jennings
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies of the Software, its documentation and marketing & publicity
 * materials, and acknowledgment shall be given in the documentation, materials
 * and software packages that this Software was used.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
 * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "config.h"

/* MMX routines for tinting XImages on AMD64 updated by Tres <tres@mindspring.com> */
/* MMX routines for tinting XImages written by Willem Monsuwe <willem@stack.nl>    */

/* AMD64 Function calling conventions:
 *   shade_ximage_xx_mmx(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 *   shade_ximage_xx_mmx_64(void *data, int bpl, int w, int h, int rm, int gm, int bm);
 */

/* Manuals used in this port:
 *	The Gnu Assembler
 *		http://www.gnu.org/software/binutils/manual/gas-2.9.1/html_mono/as.html
 *	AMD64 Architecture Programmer's Manual Volume 1: Application Programming
 *		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24592.pdf
 *	AMD64 Architecture Programmer's Manual Volume 2: System Programming
 *		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
 *	AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions
 *		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24594.pdf
 *	AMD64 Architecture Programmer's Manual Volume 4: 128-Bit Media Instructions
 *		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26568.pdf
 *	AMD64 Architecture Programmer's Manual Volume 5: 64-Bit Media and x87 Floating-Point Instructions
 *		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26569.pdf
 */

#ifdef HAVE_MMX_64

/*
 * Argument slots, addressed relative to %rbp once ENTER has set up the
 * frame.  The layout assumed here is: 0(%rbp) = saved %rbp,
 * 8(%rbp) = return address, then one 8-byte slot per argument in
 * declaration order.
 *
 * NOTE(review): on ELF x86-64 (System V AMD64 ABI) the first six integer
 * arguments are passed in rdi, rsi, rdx, rcx, r8 and r9, and only the
 * seventh is on the stack.  These offsets look like a transliteration of
 * the 32-bit cdecl frame -- confirm against the C caller.
 *
 * These are preprocessor macros, so the bare tokens data, bpl, w, h, rm,
 * gm and bm are replaced wherever they appear outside a comment.
 */
#define data	16(%rbp)
#define bpl	24(%rbp)
#define w	32(%rbp)
#define h	40(%rbp)
#define rm	48(%rbp)
#define gm	56(%rbp)
#define bm	64(%rbp)

/*
.global shade_ximage_15_mmx_64
        .type shade_ximage_15_mmx_64,@function
.global shade_ximage_16_mmx_64
        .type shade_ximage_16_mmx_64,@function
*/
/* Export the 32 bpp routine; the 15/16 bpp exports above are commented out. */
.global shade_ximage_32_mmx_64
        .type shade_ximage_32_mmx_64,@function

/* .bss is selected and immediately left again: nothing is placed in it. */
.bss
.text
.align 8

/*
 * ENTER -- common prologue for routines that fetch their arguments through
 * the data/w/h macros.
 *
 *   - saves %rbp and makes it the frame pointer (the argument macros are
 *     %rbp-relative, so they are only valid after this point);
 *   - pushes %rbx, %rcx, %rdx, %rdi, %rsi (LEAVE pops them in the reverse
 *     order, so the two macros must be kept in sync);
 *   - loads %rsi = data, %rbx = w, %rdx = h with 64-bit moves.
 *
 * NOTE(review): w and h are declared int in the prototype comment at the
 * top of the file, yet they are read with movq (8 bytes) -- confirm that
 * the upper halves of those slots are defined.
 */
#define ENTER                   \
        pushq %rbp              ;\
        movq %rsp, %rbp         ;\
        pushq %rbx              ;\
        pushq %rcx              ;\
        pushq %rdx              ;\
        pushq %rdi              ;\
        pushq %rsi              ;\
        movq data, %rsi         ;\
        movq w, %rbx            ;\
        movq h, %rdx

/*
 * LEAVE -- common epilogue matching ENTER.
 *
 *   - defines the numeric local label 4 at its first instruction, so a
 *     "4f" reference placed before LEAVE can land here when no nearer
 *     4: label intervenes;
 *   - emms leaves MMX state so that x87 code can run afterwards;
 *   - pops %rsi, %rdi, %rdx, %rcx, %rbx (reverse of ENTER's pushes),
 *     restores %rsp from %rbp, pops %rbp and returns.
 */
#define LEAVE                   \
4:                              ;\
        emms                    ;\
        popq %rsi               ;\
        popq %rdi               ;\
        popq %rdx               ;\
        popq %rcx               ;\
        popq %rbx               ;\
        movq %rbp, %rsp         ;\
        popq %rbp               ;\
        ret

#if 0 /* This comments out everything from here down to 'shade_ximage_32_mmx_64()' */
	/* The AMD64 port is complete down to this point and for the last function, shade_ximage_32_mmx_64; the 15/16 bpp routines in between are not ported yet. */
/*
 * shade_ximage_15_mmx_64 -- DISABLED (inside "#if 0"), not yet ported.
 *
 * Tints 16-bit pixels whose channels sit at bit offsets 10 / 5 / 0, five
 * bits each (see the shift counts below).  Every channel is moved to bits
 * 8..12 of its word, multiplied by its modifier with pmulhw (high 16 bits
 * of the product) and the three results are re-packed.
 *
 * Per row, pixels are handled four at a time with 64-bit loads (label 2),
 * and the remaining pixels one at a time through %ax (label 3).
 *
 * If any of rm, gm, bm is greater than 256 control is transferred to a
 * saturating variant instead.
 *
 * NOTE(review): this is still 32-bit code.  It forms addresses from
 * %esi/%ecx and uses leal/addl on the data pointer, which truncates
 * pointers in 64-bit mode.  The jg targets name
 * shade_ximage_15_mmx_saturate, while the label defined in this file is
 * shade_ximage_15_mmx_saturate_64 -- resolve both before enabling.
 */
shade_ximage_15_mmx_64:
        ENTER

        /* %esi = data + 2*w - 6, %ebx = -w; leave early when w == 0 */
        leal -6(%esi, %ebx, 2), %esi
        negl %ebx
        jz 5f

        /* Setup multipliers */
        movd rm, %mm5
        movd gm, %mm6
        movd bm, %mm7
        punpcklwd %mm5, %mm5    /* 00 00 00 00 rm rm rm rm */
        punpcklwd %mm6, %mm6    /* 00 00 00 00 gm gm gm gm */
        punpcklwd %mm7, %mm7    /* 00 00 00 00 bm bm bm bm */
        punpckldq %mm5, %mm5    /* rm rm rm rm rm rm rm rm */
        punpckldq %mm6, %mm6    /* gm gm gm gm gm gm gm gm */
        punpckldq %mm7, %mm7    /* bm bm bm bm bm bm bm bm */

        /* Any modifier above 256 needs the saturating code path */
        cmpl $256, rm
        jg shade_ximage_15_mmx_saturate
        cmpl $256, gm
        jg shade_ximage_15_mmx_saturate
        cmpl $256, bm
        jg shade_ximage_15_mmx_saturate

        /* Row start: %ecx = 3 - w; non-negative means fewer than 4 pixels */
1:      movl %ebx, %ecx
        addl $3, %ecx
        jns 3f
        /* Four pixels per iteration */
2:
        movq (%esi, %ecx, 2), %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $10, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $11, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $3, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* 00 0r */
        pmulhw %mm6, %mm1       /* 00 0g */
        pmulhw %mm7, %mm2       /* 00 0b */

        psllw $10, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */
        
        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js 2b
        jmp 4f
        /* One pixel per iteration, moved through %ax */
3:
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $10, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $11, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $3, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* 00 0r */
        pmulhw %mm6, %mm1       /* 00 0g */
        pmulhw %mm7, %mm2       /* 00 0b */

        psllw $10, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
        /* Keep going while %ecx <= 2, i.e. until the row is finished */
4:
        cmpl $2, %ecx
        jng 3b

        /* Next row: advance by bytes-per-line, count rows down in %edx */
        addl bpl, %esi
        decl %edx
        jnz 1b
5:
        LEAVE


/*
 * shade_ximage_15_mmx_saturate_64 -- DISABLED (inside "#if 0"), not yet
 * ported.
 *
 * Same pixel layout and loop structure as shade_ximage_15_mmx_64, with an
 * extra clamping step after the multiply: each product has ff e0 added
 * with unsigned saturation (paddusw), and for green and blue the same
 * constant is subtracted again (psubw).
 *
 * There is no ENTER here.  The body reads %ebx, %esi, %edx and
 * %mm5..%mm7 without initialising them and ends in LEAVE, so it
 * presumably expects to be reached by a jump from the non-saturating
 * routine after that routine's setup.
 *
 * NOTE(review): the jumps in shade_ximage_15_mmx_64 name
 * shade_ximage_15_mmx_saturate (no _64 suffix), and this body still uses
 * 32-bit registers for addressing -- resolve both before enabling.
 */
shade_ximage_15_mmx_saturate_64:

        pcmpeqw %mm3, %mm3
        psllw $5, %mm3          /* ff e0 ff e0 ff e0 ff e0 */

        /* Row start: %ecx = 3 - w; non-negative means fewer than 4 pixels */
1:      movl %ebx, %ecx
        addl $3, %ecx
        jns 3f
        /* Four pixels per iteration */
2:
        movq (%esi, %ecx, 2), %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $10, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $11, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $3, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* xx xr */
        pmulhw %mm6, %mm1       /* xx xg */
        pmulhw %mm7, %mm2       /* xx xb */

        /* Saturate upper */
        paddusw %mm3, %mm0      /* ff er */
        paddusw %mm3, %mm1      /* ff eg */
        paddusw %mm3, %mm2      /* ff eb */

        psubw %mm3, %mm1        /* 00 0g */
        psubw %mm3, %mm2        /* 00 0b */
        
        psllw $10, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js 2b
        jmp 4f
        /* One pixel per iteration, moved through %ax */
3:
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $10, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $11, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $3, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* xx xr */
        pmulhw %mm6, %mm1       /* xx xg */
        pmulhw %mm7, %mm2       /* xx xb */

        /* Saturate upper */
        paddusw %mm3, %mm0      /* ff er */
        paddusw %mm3, %mm1      /* ff eg */
        paddusw %mm3, %mm2      /* ff eb */

        psubw %mm3, %mm1        /* 00 0g */
        psubw %mm3, %mm2        /* 00 0b */
        
        psllw $10, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
        /* Keep going while %ecx <= 2, i.e. until the row is finished */
4:
        cmpl $2, %ecx
        jng 3b

        /* Next row: advance by bytes-per-line, count rows down in %edx */
        addl bpl, %esi
        decl %edx
        jnz 1b
5:
        LEAVE


/*
 * shade_ximage_16_mmx_64 -- DISABLED (inside "#if 0"), not yet ported.
 *
 * Tints 16-bit pixels whose channels sit at bit offsets 11 / 5 / 0 with
 * widths 5 / 6 / 5 (see the shift counts below).  Every channel is moved
 * to the upper byte region of its word, multiplied by its modifier with
 * pmulhw (high 16 bits of the product) and the results are re-packed.
 *
 * Per row, pixels are handled four at a time with 64-bit loads (label 2),
 * and the remaining pixels one at a time through %ax (label 3).
 *
 * If any of rm, gm, bm is greater than 256 control is transferred to a
 * saturating variant instead.
 *
 * NOTE(review): this is still 32-bit code.  It forms addresses from
 * %esi/%ecx and uses leal/addl on the data pointer, which truncates
 * pointers in 64-bit mode.  The jg targets name
 * shade_ximage_16_mmx_saturate, while the label defined in this file is
 * shade_ximage_16_mmx_saturate_64 -- resolve both before enabling.
 */
shade_ximage_16_mmx_64:
        ENTER

        /* %esi = data + 2*w - 6, %ebx = -w; leave early when w == 0 */
        leal -6(%esi, %ebx, 2), %esi
        negl %ebx
        jz 5f

        /* Setup multipliers */
        movd rm, %mm5
        movd gm, %mm6
        movd bm, %mm7
        punpcklwd %mm5, %mm5    /* 00 00 00 00 rm rm rm rm */
        punpcklwd %mm6, %mm6    /* 00 00 00 00 gm gm gm gm */
        punpcklwd %mm7, %mm7    /* 00 00 00 00 bm bm bm bm */
        punpckldq %mm5, %mm5    /* rm rm rm rm rm rm rm rm */
        punpckldq %mm6, %mm6    /* gm gm gm gm gm gm gm gm */
        punpckldq %mm7, %mm7    /* bm bm bm bm bm bm bm bm */

        /* Any modifier above 256 needs the saturating code path */
        cmpl $256, rm
        jg shade_ximage_16_mmx_saturate
        cmpl $256, gm
        jg shade_ximage_16_mmx_saturate
        cmpl $256, bm
        jg shade_ximage_16_mmx_saturate

        /* Row start: %ecx = 3 - w; non-negative means fewer than 4 pixels */
1:      movl %ebx, %ecx
        addl $3, %ecx
        jns 3f
        /* Four pixels per iteration */
2:
        movq (%esi, %ecx, 2), %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $11, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $10, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $2, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* 00 0r */
        pmulhw %mm6, %mm1       /* 00 0g */
        pmulhw %mm7, %mm2       /* 00 0b */

        psllw $11, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */
        
        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js 2b
	jmp 4f
        /* One pixel per iteration, moved through %ax */
3:
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $11, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $10, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $2, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* 00 0r */
        pmulhw %mm6, %mm1       /* 00 0g */
        pmulhw %mm7, %mm2       /* 00 0b */

        psllw $11, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
        /* Keep going while %ecx <= 2, i.e. until the row is finished */
4:
        cmpl $2, %ecx
        jng 3b

        /* Next row: advance by bytes-per-line, count rows down in %edx */
        addl bpl, %esi
        decl %edx
        jnz 1b
5:
        LEAVE


/*
 * shade_ximage_16_mmx_saturate_64 -- DISABLED (inside "#if 0"), not yet
 * ported.
 *
 * Same pixel layout and loop structure as shade_ximage_16_mmx_64, with an
 * extra clamping step after the multiply.  Two constants are used because
 * green is one bit wider than red and blue: ff e0 (%mm3) for red/blue and
 * ff c0 (%mm4) for green.  Each product has its constant added with
 * unsigned saturation (paddusw); for green and blue the constant is then
 * subtracted again (psubw).
 *
 * There is no ENTER here.  The body reads %ebx, %esi, %edx and
 * %mm5..%mm7 without initialising them and ends in LEAVE, so it
 * presumably expects to be reached by a jump from the non-saturating
 * routine after that routine's setup.
 *
 * NOTE(review): the jumps in shade_ximage_16_mmx_64 name
 * shade_ximage_16_mmx_saturate (no _64 suffix), and this body still uses
 * 32-bit registers for addressing -- resolve both before enabling.
 */
shade_ximage_16_mmx_saturate_64:

        pcmpeqw %mm3, %mm3
        movq %mm3, %mm4
        psllw $5, %mm3          /* ff e0 ff e0 ff e0 ff e0 */
        psllw $6, %mm4          /* ff c0 ff c0 ff c0 ff c0 */

        /* Row start: %ecx = 3 - w; non-negative means fewer than 4 pixels */
1:      movl %ebx, %ecx
        addl $3, %ecx
        jns 3f
        /* Four pixels per iteration */
2:
        movq (%esi, %ecx, 2), %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $11, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $10, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $2, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* xx xr */
        pmulhw %mm6, %mm1       /* xx xg */
        pmulhw %mm7, %mm2       /* xx xb */

        /* Saturate upper */
        paddusw %mm3, %mm0      /* ff er */
        paddusw %mm4, %mm1      /* ff cg */
        paddusw %mm3, %mm2      /* ff eb */

        psubw %mm4, %mm1        /* 00 0g */
        psubw %mm3, %mm2        /* 00 0b */
        
        psllw $11, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movq %mm0, (%esi, %ecx, 2)

        addl $4, %ecx
        js 2b
        jmp 4f
        /* One pixel per iteration, moved through %ax */
3:
        movw (%esi, %ecx, 2), %ax
        movd %eax, %mm0

        movq %mm0, %mm1         /* rg gb */
        movq %mm0, %mm2         /* rg gb */
        psrlw $5, %mm1          /* 0r rg */
        psrlw $11, %mm0         /* 00 0r */
        psllw $11, %mm2         /* b0 00 */
        psllw $10, %mm1         /* g0 00 */
        psllw $8, %mm0          /* 0r 00 */
        psrlw $2, %mm1          /* 0g 00 */
        psrlw $3, %mm2          /* 0b 00 */

        pmulhw %mm5, %mm0       /* xx xr */
        pmulhw %mm6, %mm1       /* xx xg */
        pmulhw %mm7, %mm2       /* xx xb */

        /* Saturate upper */
        paddusw %mm3, %mm0      /* ff er */
        paddusw %mm4, %mm1      /* ff cg */
        paddusw %mm3, %mm2      /* ff eb */

        psubw %mm4, %mm1        /* 00 0g */
        psubw %mm3, %mm2        /* 00 0b */
        
        psllw $11, %mm0         /* r0 00 */
        psllw $5, %mm1          /* 0g g0 */
        por %mm2, %mm0          /* r0 0b */
        por %mm1, %mm0          /* rg gb */

        movd %mm0, %eax
        movw %ax, (%esi, %ecx, 2)

        incl %ecx
        /* Keep going while %ecx <= 2, i.e. until the row is finished */
4:
        cmpl $2, %ecx
        jng 3b

        /* Next row: advance by bytes-per-line, count rows down in %edx */
        addl bpl, %esi
        decl %edx
        jnz 1b
5:
        LEAVE
#endif  /* The commented out section  */


/*
 * shade_ximage_32_mmx_64(void *data, int bpl, int w, int h,
 *                        int rm, int gm, int bm)
 *
 * Tints a 32-bits-per-pixel image in place.  For every pixel the byte at
 * offset 0 is scaled by the blue modifier, offset 1 by the green modifier
 * and offset 2 by the red modifier; each result is roughly
 * (channel * modifier) / 256, clamped to 0..255.  The byte at offset 3 has
 * a zero multiplier and is therefore stored as 0.
 *
 * ABI: System V AMD64 (this file is ELF-only, see the .type directive).
 *   In:   rdi     = data  pointer to the first pixel of the first row
 *         esi     = bpl   bytes from one row to the next
 *         edx     = w     pixels per row
 *         ecx     = h     number of rows
 *         r8d     = rm    red modifier   (256 = unchanged)
 *         r9d     = gm    green modifier
 *         8(%rsp) = bm    blue modifier  (seventh argument, on the stack)
 *   Out:  nothing; the image is modified in place.
 *   Clobbers: rcx, rdx, rsi, rdi, r10, mm0, mm1, mm4, mm5, mm6, flags.
 *             All of these are caller-saved, so nothing is pushed and no
 *             frame is built.  MMX state is released with emms on exit.
 *
 * The int arguments are sign-extended before use because the upper halves
 * of their registers are not defined by the ABI.  Nothing is done when
 * w <= 0 or h <= 0.
 *
 * NOTE(review): the modifiers are packed into 16-bit lanes and multiplied
 * as signed words, so they are assumed to lie in 0..32767 -- confirm
 * against the caller.
 *
 * The stack argument is addressed literally (not through the argument
 * macros defined at the top of the file) because those macros describe an
 * all-on-stack frame.
 */
shade_ximage_32_mmx_64:
        movslq %edx, %rdx               /* rdx = width in pixels                 */
        movslq %ecx, %rcx               /* rcx = rows still to process           */
        movslq %esi, %rsi               /* rsi = row stride in bytes             */

        testq %rdx, %rdx                /* empty or negative width: nothing to   */
        jle 3f                          /* do (a negative count would never      */
        testq %rcx, %rcx                /* reach zero in the loops below)        */
        jle 3f

        leaq (%rdi, %rdx, 4), %rdi      /* rdi = one past the last pixel of row  */
        negq %rdx                       /* rdx = -width; columns count up to 0   */

        /* Build mm4 = 0000 : rm : gm : bm (four 16-bit lanes, high to low). */
        movd %r8d, %mm4                 /* movd zero-extends the 32-bit int      */
        movd %r9d, %mm5
        movd 8(%rsp), %mm6              /* seventh argument, just above the      */
                                        /* return address                        */
        psllq $32, %mm4                 /* red   -> lane 2                       */
        psllq $16, %mm5                 /* green -> lane 1                       */
        por %mm6, %mm4                  /* blue stays in lane 0                  */
        por %mm5, %mm4

        /*
         * The channel words are biased by 0x8000 below so that the signed
         * multiply sees them as (c * 256 - 32768).  mm5 holds the matching
         * correction term, (-32768 * modifier) >> 16, which is subtracted
         * again after the multiply.
         */
        pcmpeqw %mm6, %mm6              /* all ones                              */
        psllw $15, %mm6                 /* 80 00 80 00 80 00 80 00               */
        movq %mm6, %mm5
        pmulhw %mm4, %mm5               /* mm5 = correction term per lane        */
1:
        movq %rdx, %r10                 /* r10 = -width: first column of the row */
2:
        movd (%rdi, %r10, 4), %mm1      /* one pixel: 00 00 00 00 xx rr gg bb    */
        pxor %mm0, %mm0
        punpcklbw %mm1, %mm0            /* xx 00 rr 00 gg 00 bb 00 (c << 8)      */
        pxor %mm6, %mm0                 /* flip the sign bit of every lane       */
        pmulhw %mm4, %mm0               /* high 16 bits of lane * modifier       */
        psubw %mm5, %mm0                /* remove the bias introduced above      */
        packuswb %mm0, %mm0             /* clamp every lane to 0..255            */
        movd %mm0, (%rdi, %r10, 4)      /* store the tinted pixel                */

        incq %r10
        jnz 2b                          /* until the column index reaches 0      */

        addq %rsi, %rdi                 /* end of row -> end of the next row     */
        decq %rcx
        jnz 1b                          /* until all rows are done               */
3:
        emms                            /* leave MMX state for x87 users         */
        ret
        .size shade_ximage_32_mmx_64, .-shade_ximage_32_mmx_64

#endif /* HAVE_MMX_64 */


#ifdef MY_CONCERNS
/*
 * Concerns:
 *	The AMD books say that some of the ops are SSE2 but they are in a MMX set of routines.
 *		Verify that MMX = 64 bit extensions and SSE2 = 128 bit extensions. (TRUE)
 *		The book uses 'xmm1' for SSE2 (128 bit), so maybe 'xm1' is MMX/64 bit? (FALSE: the 64-bit MMX registers are named mm0-mm7)
 *	The movd instructions for the pixel array work on 32 bits or a double word.
 *		Investigate the possibility of handling 2 pixels (64 bits) at a time on AMD64 with movq
 *	Do we need to use WORDS_BIGENDIAN to avoid the bad blue acid trip fixed in 0.9.3-r1?
 *
 */
#endif