Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 80685 | Differences between
and this patch

Collapse All | Expand All

(-)fbmmx.c.orig (-353 / +588 lines)
Lines 1-5 Link Here
1
/*
1
/*
2
 * Copyright © 2004 Red Hat, Inc.
2
 * Copyright © 2004 Red Hat, Inc.
3
 * Copyright © 2004 Nicholas Miell
3
 *
4
 *
4
 * Permission to use, copy, modify, distribute, and sell this software and its
5
 * Permission to use, copy, modify, distribute, and sell this software and its
5
 * documentation for any purpose is hereby granted without fee, provided that
6
 * documentation for any purpose is hereby granted without fee, provided that
Lines 18-31 Link Here
18
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 
19
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN 
19
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
20
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
20
 *
21
 *
21
 * Author:  Søren Sandmann (sandmann@redhat.com)
22
 * Author:  Søren Sandmann (sandmann@redhat.com)
22
 * 
23
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
24
 *
23
 * Based on work by Owen Taylor
25
 * Based on work by Owen Taylor
24
 */
26
 */
25
27
26
#include "fb.h"
27
28
28
#ifdef USE_GCC34_MMX
29
#ifdef USE_GCC34_MMX
30
#else
31
#error "Kala"
32
#include "fb.h"
33
#include "fbmmx.h"
34
35
#include <mmintrin.h>
36
37
#ifdef USE_SSE
38
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
39
#endif 
29
40
30
#ifdef RENDER
41
#ifdef RENDER
31
42
Lines 33-43 Link Here
33
#include "mipict.h"
44
#include "mipict.h"
34
#include "fbpict.h"
45
#include "fbpict.h"
35
46
36
typedef int Vector1x64 __attribute__ ((mode(DI)));
37
typedef int Vector2x32 __attribute__ ((mode(V2SI)));
38
typedef int Vector4x16 __attribute__ ((mode(V4HI)));
39
typedef int Vector8x8  __attribute__ ((mode(V8QI)));
40
41
typedef unsigned long long ullong;
47
typedef unsigned long long ullong;
42
48
43
#define noVERBOSE
49
#define noVERBOSE
Lines 50-56 Link Here
50
56
51
typedef struct
57
typedef struct
52
{
58
{
53
    ullong mmx_zero;
54
    ullong mmx_4x00ff;
59
    ullong mmx_4x00ff;
55
    ullong mmx_4x0080;
60
    ullong mmx_4x0080;
56
    ullong mmx_565_rgb;
61
    ullong mmx_565_rgb;
Lines 70-76 Link Here
70
75
71
static const MMXData c =
76
static const MMXData c =
72
{
77
{
73
    .mmx_zero =				0x0000000000000000ULL,
74
    .mmx_4x00ff =			0x00ff00ff00ff00ffULL,
78
    .mmx_4x00ff =			0x00ff00ff00ff00ffULL,
75
    .mmx_4x0080 =			0x0080008000800080ULL,
79
    .mmx_4x0080 =			0x0080008000800080ULL,
76
    .mmx_565_rgb =			0x000001f0003f001fULL,
80
    .mmx_565_rgb =			0x000001f0003f001fULL,
Lines 88-208 Link Here
88
    .mmx_000000000000ffff =		0x000000000000ffffULL,
92
    .mmx_000000000000ffff =		0x000000000000ffffULL,
89
};
93
};
90
94
91
static __inline__ Vector1x64
95
#define MC(x) ((__m64) c.mmx_##x)
92
shift (Vector1x64 v, int s)
96
97
static __inline__ __m64
98
shift (__m64 v, int s)
93
{
99
{
94
    if (s > 0)
100
    if (s > 0)
95
	return __builtin_ia32_psllq (v, s);
101
	return _mm_slli_si64 (v, s);
96
    else if (s < 0)
102
    else if (s < 0)
97
	return __builtin_ia32_psrlq (v, -s);
103
	return _mm_srli_si64 (v, -s);
98
    else
104
    else
99
	return v;
105
	return v;
100
}
106
}
101
107
102
static __inline__ Vector4x16
108
static __inline__ __m64
103
negate (Vector4x16 mask)
109
negate (__m64 mask)
104
{
110
{
105
    return (Vector4x16)__builtin_ia32_pxor (
111
    return _mm_xor_si64 (mask, MC(4x00ff));
106
	(Vector1x64)mask,
107
	(Vector1x64)c.mmx_4x00ff);
108
}
112
}
109
113
110
static __inline__ Vector4x16
114
static __inline__ __m64
111
pix_multiply (Vector4x16 a, Vector4x16 b)
115
pix_multiply (__m64 a, __m64 b)
112
{
116
{
113
    Vector4x16 res;
117
    __m64 res;
114
    
118
    
115
    res = __builtin_ia32_pmullw (a, b);
119
    res = _mm_mullo_pi16 (a, b);
116
    res = __builtin_ia32_paddw (res, (Vector4x16)c.mmx_4x0080);
120
    res = _mm_add_pi16 (res, MC(4x0080));
117
    res = __builtin_ia32_psrlw (res, 8);
121
    res = _mm_srli_pi16 (res, 8);
118
    
122
    
119
    return res;
123
    return res;
120
}
124
}
121
125
122
#if 0
126
#ifdef USE_SSE
123
#define HAVE_PSHUFW
127
#define HAVE_PSHUFW
124
#endif
128
#endif
125
129
126
#ifdef HAVE_PSHUFW
130
#ifdef HAVE_PSHUFW
127
131
128
static __inline__ Vector4x16
132
static __inline__ __m64
129
expand_alpha (Vector4x16 pixel)
133
expand_alpha (__m64 pixel)
130
{
134
{
131
    Vector4x16 result;
135
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
132
    __asm__ ("pshufw $0xFF, %1, %0\n\t" : "=y" (result) : "y" (pixel));
133
    return result;
134
}
136
}
135
137
136
static __inline__ Vector4x16
138
static __inline__ __m64
137
expand_alpha_rev (Vector4x16 pixel)
139
expand_alpha_rev (__m64 pixel)
138
{
140
{
139
    Vector4x16 result;
141
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
140
    __asm__ ("pshufw $0x00, %1, %0\n\t" : "=y" (result) : "y" (pixel));
141
    return result;
142
}    
142
}    
143
143
144
static __inline__ Vector4x16
144
static __inline__ __m64
145
invert_colors (Vector4x16 pixel)
145
invert_colors (__m64 pixel)
146
{
146
{
147
    Vector4x16 result;
147
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
148
149
    /* 0xC6 = 11000110 */
150
    /*         3 0 1 2 */
151
    
152
    __asm__ ("pshufw $0xC6, %1, %0\n\t" : "=y" (result) : "y" (pixel));
153
154
    return result;
155
}
148
}
156
149
157
#else
150
#else
158
151
159
static __inline__ Vector4x16
152
static __inline__ __m64
160
expand_alpha (Vector4x16 pixel)
153
expand_alpha (__m64 pixel)
161
{
154
{
162
    Vector1x64 t1, t2;
155
    __m64 t1, t2;
163
156
    
164
    t1 = shift ((Vector1x64)pixel, -48);
157
    t1 = shift (pixel, -48);
165
    t2 = shift (t1, 16);
158
    t2 = shift (t1, 16);
166
    t1 = __builtin_ia32_por (t1, t2);
159
    t1 = _mm_or_si64 (t1, t2);
167
    t2 = shift (t1, 32);
160
    t2 = shift (t1, 32);
168
    t1 = __builtin_ia32_por (t1, t2);
161
    t1 = _mm_or_si64 (t1, t2);
169
162
    
170
    return (Vector4x16)t1;
163
    return t1;
171
}
164
}
172
165
173
static __inline__ Vector4x16
166
static __inline__ __m64
174
expand_alpha_rev (Vector4x16 pixel)
167
expand_alpha_rev (__m64 pixel)
175
{
168
{
176
    Vector1x64 t1, t2;
169
    __m64 t1, t2;
177
170
    
178
    t1 = shift ((Vector1x64)pixel,  48);
171
    /* move alpha to low 16 bits and zero the rest */
172
    t1 = shift (pixel,  48);
179
    t1 = shift (t1, -48);
173
    t1 = shift (t1, -48);
174
    
180
    t2 = shift (t1, 16);
175
    t2 = shift (t1, 16);
181
    t1 = __builtin_ia32_por (t1, t2);
176
    t1 = _mm_or_si64 (t1, t2);
182
    t2 = shift (t1, 32);
177
    t2 = shift (t1, 32);
183
    t1 = __builtin_ia32_por (t1, t2);
178
    t1 = _mm_or_si64 (t1, t2);
184
179
    
185
    return (Vector4x16)t1;
180
    return t1;
186
}
181
}
187
182
188
static __inline__ Vector4x16
183
static __inline__ __m64
189
invert_colors (Vector4x16 pixel)
184
invert_colors (__m64 pixel)
190
{
185
{
191
    Vector1x64 x, y, z;
186
    __m64 x, y, z;
192
187
    
193
    x = y = z = (Vector1x64)pixel;
188
    x = y = z = pixel;
194
189
    
195
    x = __builtin_ia32_pand (x, (Vector1x64)c.mmx_ffff0000ffff0000);
190
    x = _mm_and_si64 (x, MC(ffff0000ffff0000));
196
    y = __builtin_ia32_pand (y, (Vector1x64)c.mmx_000000000000ffff);
191
    y = _mm_and_si64 (y, MC(000000000000ffff));
197
    z = __builtin_ia32_pand (z, (Vector1x64)c.mmx_0000ffff00000000);
192
    z = _mm_and_si64 (z, MC(0000ffff00000000));
198
193
    
199
    y = shift (y, 32);
194
    y = shift (y, 32);
200
    z = shift (z, -32);
195
    z = shift (z, -32);
201
196
    
202
    x = __builtin_ia32_por (x, y);
197
    x = _mm_or_si64 (x, y);
203
    x = __builtin_ia32_por (x, z);
198
    x = _mm_or_si64 (x, z);
204
199
    
205
    return (Vector4x16)x;
200
    return x;
206
}
201
}
207
202
208
#endif
203
#endif
Lines 210-356 Link Here
210
/* Notes about writing mmx code
205
/* Notes about writing mmx code
211
 *
206
 *
212
 * give memory operands as the second operand. If you give it as the
207
 * give memory operands as the second operand. If you give it as the
213
 * first, gcc will first load it into a register, then use that register
208
 * first, gcc will first load it into a register, then use that
209
 * register
214
 *
210
 *
215
 *   ie. use
211
 *   ie. use
216
 *
212
 *
217
 *         __builtin_pmullw (x, mmx_constant[8]);
213
 *         _mm_mullo_pi16 (x, mmx_constant);
218
 *
214
 *
219
 *   not
215
 *   not
220
 *
216
 *
221
 *         __builtin_pmullw (mmx_constant[8], x);
217
 *         _mm_mullo_pi16 (mmx_constant, x);
222
 *
218
 *
223
 * Also try to minimize dependencies. Ie. when you need a value, try to calculate
219
 * Also try to minimize dependencies. i.e. when you need a value, try
224
 * it from a value that was calculated as early as possible.
220
 * to calculate it from a value that was calculated as early as
221
 * possible.
225
 */
222
 */
226
223
227
static __inline__ Vector4x16
224
static __inline__ __m64
228
over (Vector4x16 src, Vector4x16 srca, Vector4x16 dest)
225
over (__m64 src, __m64 srca, __m64 dest)
229
{
226
{
230
    return (Vector4x16)__builtin_ia32_paddusb ((Vector8x8)src, (Vector8x8)pix_multiply(dest, negate(srca)));
227
    return  _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
231
}
228
}
232
229
233
static __inline__ Vector4x16
230
static __inline__ __m64
234
over_rev_non_pre (Vector4x16 src, Vector4x16 dest)
231
over_rev_non_pre (__m64 src, __m64 dest)
235
{
232
{
236
    Vector4x16 srca = expand_alpha (src);
233
    __m64 srca = expand_alpha (src);
237
    Vector4x16 srcfaaa = (Vector4x16)__builtin_ia32_por((Vector1x64)srca, (Vector1x64)c.mmx_full_alpha);
234
    __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));
238
235
    
239
    return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
236
    return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
240
}
237
}
241
238
242
static __inline__ Vector4x16
239
static __inline__ __m64
243
in (Vector4x16 src,
240
in (__m64 src,
244
    Vector4x16 mask)
241
    __m64 mask)
245
{
242
{
246
    return pix_multiply (src, mask);
243
    return pix_multiply (src, mask);
247
}
244
}
248
245
249
static __inline__ Vector4x16
246
static __inline__ __m64
250
in_over (Vector4x16 src,
247
in_over (__m64 src,
251
	 Vector4x16 srca,
248
	 __m64 srca,
252
	 Vector4x16 mask,
249
	 __m64 mask,
253
	 Vector4x16 dest)
250
	 __m64 dest)
254
{
251
{
255
    return over(in(src, mask), pix_multiply(srca, mask), dest);
252
    return over(in(src, mask), pix_multiply(srca, mask), dest);
256
}
253
}
257
254
258
static __inline__ Vector8x8
255
static __inline__ __m64
259
cvt32to64 (CARD32 v)
260
{
261
    ullong r = v;
262
    return (Vector8x8)r;
263
}
264
265
static __inline__ Vector4x16
266
load8888 (CARD32 v)
256
load8888 (CARD32 v)
267
{
257
{
268
    return (Vector4x16)__builtin_ia32_punpcklbw (cvt32to64 (v),
258
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
269
						 (Vector8x8)c.mmx_zero);
270
}
259
}
271
260
272
static __inline__ Vector8x8
261
static __inline__ __m64
273
pack8888 (Vector4x16 lo, Vector4x16 hi)
262
pack8888 (__m64 lo, __m64 hi)
274
{
263
{
275
    Vector8x8 r;
264
    __m64 r;
276
    r = __builtin_ia32_packuswb ((Vector4x16)lo, (Vector4x16)hi);
265
    r = _mm_packs_pu16 (lo, hi);
277
    return r;
266
    return r;
278
}
267
}
279
268
280
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into 00RR00GG00BB
269
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
281
   
270
 *
282
--- Expanding 565 in the low word ---
271
 *    00RR00GG00BB
283
272
 * 
284
m = (m << (32 - 3)) | (m << (16 - 5)) | m;
273
 * --- Expanding 565 in the low word ---
285
m = m & (01f0003f001f);
274
 * 
286
m = m * (008404100840);
275
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
287
m = m >> 8;
276
 * m = m & (01f0003f001f);
288
277
 * m = m * (008404100840);
289
Note the trick here - the top word is shifted by another nibble to avoid
278
 * m = m >> 8;
290
it bumping into the middle word
279
 * 
291
*/
280
 * Note the trick here - the top word is shifted by another nibble to
292
static __inline__ Vector4x16
281
 * avoid it bumping into the middle word
293
expand565 (Vector4x16 pixel, int pos)
282
 */
283
static __inline__ __m64
284
expand565 (__m64 pixel, int pos)
294
{
285
{
295
    Vector1x64 p = (Vector1x64)pixel;
286
    __m64 p = pixel;
287
    __m64 t1, t2;
296
    
288
    
297
    /* move pixel to low 16 bit and zero the rest */
289
    /* move pixel to low 16 bit and zero the rest */
298
    p = shift (shift (p, (3 - pos) * 16), -48); 
290
    p = shift (shift (p, (3 - pos) * 16), -48); 
299
    
291
    
300
    Vector1x64 t1 = shift (p, 36 - 11);
292
    t1 = shift (p, 36 - 11);
301
    Vector1x64 t2 = shift (p, 16 - 5);
293
    t2 = shift (p, 16 - 5);
302
    
294
    
303
    p = __builtin_ia32_por (t1, p);
295
    p = _mm_or_si64 (t1, p);
304
    p = __builtin_ia32_por (t2, p);
296
    p = _mm_or_si64 (t2, p);
305
    p = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_rgb);
297
    p = _mm_and_si64 (p, MC(565_rgb));
306
    
298
    
307
    pixel = __builtin_ia32_pmullw ((Vector4x16)p, (Vector4x16)c.mmx_565_unpack_multiplier);
299
    pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
308
    return __builtin_ia32_psrlw (pixel, 8);
300
    return _mm_srli_pi16 (pixel, 8);
309
}
301
}
310
302
311
static __inline__ Vector4x16
303
static __inline__ __m64
312
expand8888 (Vector4x16 in, int pos)
304
expand8888 (__m64 in, int pos)
313
{
305
{
314
    if (pos == 0)
306
    if (pos == 0)
315
	return (Vector4x16)__builtin_ia32_punpcklbw ((Vector8x8)in, (Vector8x8)c.mmx_zero);
307
	return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
316
    else
308
    else
317
	return (Vector4x16)__builtin_ia32_punpckhbw ((Vector8x8)in, (Vector8x8)c.mmx_zero);
309
	return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
318
}
310
}
319
311
320
static __inline__ Vector4x16
312
static __inline__ __m64
321
pack565 (Vector4x16 pixel, Vector4x16 target, int pos)
313
pack565 (__m64 pixel, __m64 target, int pos)
322
{
314
{
323
    Vector1x64 p = (Vector1x64)pixel;
315
    __m64 p = pixel;
324
    Vector1x64 t = (Vector1x64)target;
316
    __m64 t = target;
325
    Vector1x64 r, g, b;
317
    __m64 r, g, b;
326
    
318
    
327
    r = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_r);
319
    r = _mm_and_si64 (p, MC(565_r));
328
    g = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_g);
320
    g = _mm_and_si64 (p, MC(565_g));
329
    b = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_b);
321
    b = _mm_and_si64 (p, MC(565_b));
330
    
322
    
331
    r = shift (r, - (32 - 8) + pos * 16);
323
    r = shift (r, - (32 - 8) + pos * 16);
332
    g = shift (g, - (16 - 3) + pos * 16);
324
    g = shift (g, - (16 - 3) + pos * 16);
333
    b = shift (b, - (0  + 3) + pos * 16);
325
    b = shift (b, - (0  + 3) + pos * 16);
334
326
    
335
    if (pos == 0)
327
    if (pos == 0)
336
	t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_0);
328
	t = _mm_and_si64 (t, MC(mask_0));
337
    else if (pos == 1)
329
    else if (pos == 1)
338
	t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_1);
330
	t = _mm_and_si64 (t, MC(mask_1));
339
    else if (pos == 2)
331
    else if (pos == 2)
340
	t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_2);
332
	t = _mm_and_si64 (t, MC(mask_2));
341
    else if (pos == 3)
333
    else if (pos == 3)
342
	t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_3);
334
	t = _mm_and_si64 (t, MC(mask_3));
343
    
335
    
344
    p = __builtin_ia32_por (r, t);
336
    p = _mm_or_si64 (r, t);
345
    p = __builtin_ia32_por (g, p);
337
    p = _mm_or_si64 (g, p);
346
    
338
    
347
    return (Vector4x16)__builtin_ia32_por (b, p);
339
    return _mm_or_si64 (b, p);
348
}
349
350
static __inline__ void
351
emms (void)
352
{
353
    __asm__ __volatile__ ("emms");
354
}
340
}
355
341
356
void
342
void
Lines 371-378 Link Here
371
    CARD32	*dstLine, *dst;
357
    CARD32	*dstLine, *dst;
372
    CARD16	w;
358
    CARD16	w;
373
    FbStride	dstStride;
359
    FbStride	dstStride;
374
    Vector4x16	vsrc, vsrca;
360
    __m64	vsrc, vsrca;
375
361
    
376
    CHECKPOINT();
362
    CHECKPOINT();
377
    
363
    
378
    fbComposeGetSolid(pSrc, src, pDst->format);
364
    fbComposeGetSolid(pSrc, src, pDst->format);
Lines 384-434 Link Here
384
    
370
    
385
    vsrc = load8888 (src);
371
    vsrc = load8888 (src);
386
    vsrca = expand_alpha (vsrc);
372
    vsrca = expand_alpha (vsrc);
387
373
    
388
    while (height--)
374
    while (height--)
389
    {
375
    {
390
	dst = dstLine;
376
	dst = dstLine;
391
	dstLine += dstStride;
377
	dstLine += dstStride;
392
	w = width;
378
	w = width;
393
379
	
394
	CHECKPOINT();
380
	CHECKPOINT();
395
	
381
	
396
	while (w && (unsigned long)dst & 7)
382
	while (w && (unsigned long)dst & 7)
397
	{
383
	{
398
	    *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero);
384
	    *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)),
385
				     _mm_setzero_si64());
399
	    
386
	    
400
	    w--;
387
	    w--;
401
	    dst++;
388
	    dst++;
402
	}
389
	}
403
390
	
404
	while (w >= 2)
391
	while (w >= 2)
405
	{
392
	{
406
	    Vector4x16 vdest;
393
	    __m64 vdest;
407
	    Vector4x16 dest0, dest1;
394
	    __m64 dest0, dest1;
408
395
	    
409
	    vdest = *(Vector4x16 *)dst;
396
	    vdest = *(__m64 *)dst;
410
	    
397
	    
411
	    dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
398
	    dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
412
	    dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
399
	    dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
413
	    
400
	    
414
	    *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1);
401
	    *(__m64 *)dst = pack8888(dest0, dest1);
415
	    
402
	    
416
	    dst += 2;
403
	    dst += 2;
417
	    w -= 2;
404
	    w -= 2;
418
	}
405
	}
419
406
	
420
	CHECKPOINT();
407
	CHECKPOINT();
421
	
408
	
422
	while (w)
409
	while (w)
423
	{
410
	{
424
	    *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero);
411
	    *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), _mm_setzero_si64());
425
	    
412
	    
426
	    w--;
413
	    w--;
427
	    dst++;
414
	    dst++;
428
	}
415
	}
429
    }
416
    }
430
    
417
    
431
    emms();
418
    _mm_empty();
432
}
419
}
433
420
434
void
421
void
Lines 449-456 Link Here
449
    CARD16	*dstLine, *dst;
436
    CARD16	*dstLine, *dst;
450
    CARD16	w;
437
    CARD16	w;
451
    FbStride	dstStride;
438
    FbStride	dstStride;
452
    Vector4x16	vsrc, vsrca;
439
    __m64	vsrc, vsrca;
453
440
    
454
    CHECKPOINT();
441
    CHECKPOINT();
455
    
442
    
456
    fbComposeGetSolid(pSrc, src, pDst->format);
443
    fbComposeGetSolid(pSrc, src, pDst->format);
Lines 462-510 Link Here
462
    
449
    
463
    vsrc = load8888 (src);
450
    vsrc = load8888 (src);
464
    vsrca = expand_alpha (vsrc);
451
    vsrca = expand_alpha (vsrc);
465
452
    
466
    while (height--)
453
    while (height--)
467
    {
454
    {
468
	dst = dstLine;
455
	dst = dstLine;
469
	dstLine += dstStride;
456
	dstLine += dstStride;
470
	w = width;
457
	w = width;
471
458
	
472
	CHECKPOINT();
459
	CHECKPOINT();
473
	
460
	
474
	while (w && (unsigned long)dst & 7)
461
	while (w && (unsigned long)dst & 7)
475
	{
462
	{
476
	    ullong d = *dst;
463
	    ullong d = *dst;
477
	    Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
464
	    __m64 vdest = expand565 ((__m64)d, 0);
478
	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
465
	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
479
	    *dst = (ullong)vdest;
466
	    *dst = (ullong)vdest;
480
	    
467
	    
481
	    w--;
468
	    w--;
482
	    dst++;
469
	    dst++;
483
	}
470
	}
484
471
	
485
	while (w >= 4)
472
	while (w >= 4)
486
	{
473
	{
487
	    Vector4x16 vdest;
474
	    __m64 vdest;
488
475
	    
489
	    vdest = *(Vector4x16 *)dst;
476
	    vdest = *(__m64 *)dst;
490
	    
477
	    
491
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
478
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
492
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
479
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
493
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
480
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
494
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
481
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
495
	    
482
	    
496
	    *(Vector8x8 *)dst = (Vector8x8)vdest;
483
	    *(__m64 *)dst = vdest;
497
	    
484
	    
498
	    dst += 4;
485
	    dst += 4;
499
	    w -= 4;
486
	    w -= 4;
500
	}
487
	}
501
488
	
502
	CHECKPOINT();
489
	CHECKPOINT();
503
	
490
	
504
	while (w)
491
	while (w)
505
	{
492
	{
506
	    ullong d = *dst;
493
	    ullong d = *dst;
507
	    Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
494
	    __m64 vdest = expand565 ((__m64)d, 0);
508
	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
495
	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
509
	    *dst = (ullong)vdest;
496
	    *dst = (ullong)vdest;
510
	    
497
	    
Lines 513-519 Link Here
513
	}
500
	}
514
    }
501
    }
515
    
502
    
516
    emms();
503
    _mm_empty();
517
}
504
}
518
505
519
void
506
void
Lines 534-541 Link Here
534
    CARD32	*dstLine;
521
    CARD32	*dstLine;
535
    CARD32	*maskLine;
522
    CARD32	*maskLine;
536
    FbStride	dstStride, maskStride;
523
    FbStride	dstStride, maskStride;
537
    Vector4x16	vsrc, vsrca;
524
    __m64	vsrc, vsrca;
538
525
    
539
    CHECKPOINT();
526
    CHECKPOINT();
540
    
527
    
541
    fbComposeGetSolid(pSrc, src, pDst->format);
528
    fbComposeGetSolid(pSrc, src, pDst->format);
Lines 562-570 Link Here
562
	    
549
	    
563
	    if (m)
550
	    if (m)
564
	    {
551
	    {
565
		Vector4x16 vdest = load8888(*q);
552
		__m64 vdest = load8888(*q);
566
		vdest = in_over(vsrc, vsrca, load8888(m), vdest);
553
		vdest = in_over(vsrc, vsrca, load8888(m), vdest);
567
		*q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
554
		*q = (ullong)pack8888(vdest, _mm_setzero_si64());
568
	    }
555
	    }
569
	    
556
	    
570
	    twidth--;
557
	    twidth--;
Lines 580-594 Link Here
580
	    
567
	    
581
	    if (m0 | m1)
568
	    if (m0 | m1)
582
	    {
569
	    {
583
		Vector4x16 dest0, dest1;
570
		__m64 dest0, dest1;
584
		Vector4x16 vdest = *(Vector4x16 *)q;
571
		__m64 vdest = *(__m64 *)q;
585
		
572
		
586
		dest0 = in_over(vsrc, vsrca, load8888(m0),
573
		dest0 = in_over(vsrc, vsrca, load8888(m0),
587
				expand8888 (vdest, 0));
574
				expand8888 (vdest, 0));
588
		dest1 = in_over(vsrc, vsrca, load8888(m1),
575
		dest1 = in_over(vsrc, vsrca, load8888(m1),
589
				expand8888 (vdest, 1));
576
				expand8888 (vdest, 1));
590
		
577
		
591
		*(Vector8x8 *)q = (Vector8x8)pack8888(dest0, dest1);
578
		*(__m64 *)q = pack8888(dest0, dest1);
592
	    }
579
	    }
593
	    
580
	    
594
	    p += 2;
581
	    p += 2;
Lines 602-610 Link Here
602
	    
589
	    
603
	    if (m)
590
	    if (m)
604
	    {
591
	    {
605
		Vector4x16 vdest = load8888(*q);
592
		__m64 vdest = load8888(*q);
606
		vdest = in_over(vsrc, vsrca, load8888(m), vdest);
593
		vdest = in_over(vsrc, vsrca, load8888(m), vdest);
607
		*q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
594
		*q = (ullong)pack8888(vdest, _mm_setzero_si64());
608
	    }
595
	    }
609
	    
596
	    
610
	    twidth--;
597
	    twidth--;
Lines 616-622 Link Here
616
	maskLine += maskStride;
603
	maskLine += maskStride;
617
    }
604
    }
618
    
605
    
619
    emms();
606
    _mm_empty();
607
}
608
609
void
610
fbCompositeSrc_8888x8x8888mmx (CARD8	op,
611
			       PicturePtr pSrc,
612
			       PicturePtr pMask,
613
			       PicturePtr pDst,
614
			       INT16	xSrc,
615
			       INT16	ySrc,
616
			       INT16      xMask,
617
			       INT16      yMask,
618
			       INT16      xDst,
619
			       INT16      yDst,
620
			       CARD16     width,
621
			       CARD16     height)
622
{
623
    CARD32	*dstLine, *dst;
624
    CARD32	*srcLine, *src;
625
    CARD8	*maskLine;
626
    CARD32	mask;
627
    __m64	vmask;
628
    FbStride	dstStride, srcStride, maskStride;
629
    CARD16	w;
630
    __m64  srca;
631
    
632
    CHECKPOINT();
633
    
634
    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
635
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
636
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
637
638
    mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
639
    vmask = load8888 (mask);
640
    srca = MC(4x00ff);
641
    
642
    while (height--)
643
    {
644
	dst = dstLine;
645
	dstLine += dstStride;
646
	src = srcLine;
647
	srcLine += srcStride;
648
	w = width;
649
650
	while (w && (unsigned long)dst & 7)
651
	{
652
	    __m64 s = load8888 (*src);
653
	    __m64 d = load8888 (*dst);
654
	    
655
	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
656
	    
657
	    w--;
658
	    dst++;
659
	    src++;
660
	}
661
662
	while (w >= 16)
663
	{
664
	    __m64 vd0 = *(__m64 *)(dst + 0);
665
	    __m64 vd1 = *(__m64 *)(dst + 2);
666
	    __m64 vd2 = *(__m64 *)(dst + 4);
667
	    __m64 vd3 = *(__m64 *)(dst + 6);
668
	    __m64 vd4 = *(__m64 *)(dst + 8);
669
	    __m64 vd5 = *(__m64 *)(dst + 10);
670
	    __m64 vd6 = *(__m64 *)(dst + 12);
671
	    __m64 vd7 = *(__m64 *)(dst + 14);
672
673
	    __m64 vs0 = *(__m64 *)(src + 0);
674
	    __m64 vs1 = *(__m64 *)(src + 2);
675
	    __m64 vs2 = *(__m64 *)(src + 4);
676
	    __m64 vs3 = *(__m64 *)(src + 6);
677
	    __m64 vs4 = *(__m64 *)(src + 8);
678
	    __m64 vs5 = *(__m64 *)(src + 10);
679
	    __m64 vs6 = *(__m64 *)(src + 12);
680
	    __m64 vs7 = *(__m64 *)(dst + 14);
681
682
	    vd0 = (__m64)pack8888 (
683
		in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
684
		in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
685
	
686
	    vd1 = (__m64)pack8888 (
687
		in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
688
		in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
689
	
690
	    vd2 = (__m64)pack8888 (
691
		in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
692
		in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
693
	
694
	    vd3 = (__m64)pack8888 (
695
		in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
696
		in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
697
	
698
	    vd4 = (__m64)pack8888 (
699
		in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
700
		in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
701
	
702
	    vd5 = (__m64)pack8888 (
703
		in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
704
		in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
705
	
706
	    vd6 = (__m64)pack8888 (
707
		in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
708
		in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
709
	
710
	    vd7 = (__m64)pack8888 (
711
		in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
712
		in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
713
	
714
	    w -= 16;
715
	    dst += 16;
716
	    src += 16;
717
	}
718
	
719
	while (w)
720
	{
721
	    __m64 s = load8888 (*src);
722
	    __m64 d = load8888 (*dst);
723
	    
724
	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
725
	    
726
	    w--;
727
	    dst++;
728
	    src++;
729
	}
730
    }
731
732
    _mm_empty(); 
620
}
733
}
621
734
622
void
735
void
Lines 638-644 Link Here
638
    CARD8	*maskLine, *mask;
751
    CARD8	*maskLine, *mask;
639
    FbStride	dstStride, maskStride;
752
    FbStride	dstStride, maskStride;
640
    CARD16	w;
753
    CARD16	w;
641
    Vector4x16	vsrc, vsrca;
754
    __m64	vsrc, vsrca;
642
    ullong	srcsrc;
755
    ullong	srcsrc;
643
    
756
    
644
    CHECKPOINT();
757
    CHECKPOINT();
Lines 648-654 Link Here
648
    srca = src >> 24;
761
    srca = src >> 24;
649
    if (srca == 0)
762
    if (srca == 0)
650
	return;
763
	return;
651
764
    
652
    srcsrc = (unsigned long long)src << 32 | src;
765
    srcsrc = (unsigned long long)src << 32 | src;
653
    
766
    
654
    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
767
    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
Lines 664-670 Link Here
664
	mask = maskLine;
777
	mask = maskLine;
665
	maskLine += maskStride;
778
	maskLine += maskStride;
666
	w = width;
779
	w = width;
667
780
	
668
	CHECKPOINT();
781
	CHECKPOINT();
669
	
782
	
670
	while (w && (unsigned long)dst & 7)
783
	while (w && (unsigned long)dst & 7)
Lines 673-687 Link Here
673
	    
786
	    
674
	    if (m)
787
	    if (m)
675
	    {
788
	    {
676
		Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), load8888(*dst));
789
		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
677
		*dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
790
		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
678
	    }
791
	    }
679
	    
792
	    
680
	    w--;
793
	    w--;
681
	    mask++;
794
	    mask++;
682
	    dst++;
795
	    dst++;
683
	}
796
	}
684
797
	
685
	CHECKPOINT();
798
	CHECKPOINT();
686
	
799
	
687
	while (w >= 2)
800
	while (w >= 2)
Lines 689-717 Link Here
689
	    ullong m0, m1;
802
	    ullong m0, m1;
690
	    m0 = *mask;
803
	    m0 = *mask;
691
	    m1 = *(mask + 1);
804
	    m1 = *(mask + 1);
692
805
	    
693
	    if (srca == 0xff && (m0 & m1) == 0xff)
806
	    if (srca == 0xff && (m0 & m1) == 0xff)
694
	    {
807
	    {
695
		*(unsigned long long *)dst = srcsrc;
808
		*(unsigned long long *)dst = srcsrc;
696
	    }
809
	    }
697
	    else if (m0 | m1)
810
	    else if (m0 | m1)
698
	    {
811
	    {
699
		Vector4x16 vdest;
812
		__m64 vdest;
700
		Vector4x16 dest0, dest1;
813
		__m64 dest0, dest1;
701
814
		
702
		vdest = *(Vector4x16 *)dst;
815
		vdest = *(__m64 *)dst;
703
		
816
		
704
		dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m0), expand8888(vdest, 0));
817
		dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
705
		dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m1), expand8888(vdest, 1));
818
		dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));
706
		
819
		
707
		*(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1);
820
		*(__m64 *)dst = pack8888(dest0, dest1);
708
	    }
821
	    }
709
	    
822
	    
710
	    mask += 2;
823
	    mask += 2;
711
	    dst += 2;
824
	    dst += 2;
712
	    w -= 2;
825
	    w -= 2;
713
	}
826
	}
714
827
	
715
	CHECKPOINT();
828
	CHECKPOINT();
716
	
829
	
717
	while (w)
830
	while (w)
Lines 720-728 Link Here
720
	    
833
	    
721
	    if (m)
834
	    if (m)
722
	    {
835
	    {
723
		Vector4x16 vdest = load8888(*dst);
836
		__m64 vdest = load8888(*dst);
724
		vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), vdest);
837
		vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
725
		*dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
838
		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
726
	    }
839
	    }
727
	    
840
	    
728
	    w--;
841
	    w--;
Lines 731-737 Link Here
731
	}
844
	}
732
    }
845
    }
733
    
846
    
734
    emms();
847
    _mm_empty();
735
}
848
}
736
849
737
850
Lines 754-760 Link Here
754
    CARD8	*maskLine, *mask;
867
    CARD8	*maskLine, *mask;
755
    FbStride	dstStride, maskStride;
868
    FbStride	dstStride, maskStride;
756
    CARD16	w;
869
    CARD16	w;
757
    Vector4x16	vsrc, vsrca;
870
    __m64	vsrc, vsrca;
758
    unsigned long long srcsrcsrcsrc, src16;
871
    unsigned long long srcsrcsrcsrc, src16;
759
    
872
    
760
    CHECKPOINT();
873
    CHECKPOINT();
Lines 770-778 Link Here
770
    
883
    
771
    vsrc = load8888 (src);
884
    vsrc = load8888 (src);
772
    vsrca = expand_alpha (vsrc);
885
    vsrca = expand_alpha (vsrc);
773
886
    
774
    src16 = (ullong)pack565(vsrc, (Vector4x16)c.mmx_zero, 0);
887
    src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
775
888
    
776
    srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
889
    srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
777
	(ullong)src16 << 16 | (ullong)src16;
890
	(ullong)src16 << 16 | (ullong)src16;
778
    
891
    
Lines 783-789 Link Here
783
	mask = maskLine;
896
	mask = maskLine;
784
	maskLine += maskStride;
897
	maskLine += maskStride;
785
	w = width;
898
	w = width;
786
899
	
787
	CHECKPOINT();
900
	CHECKPOINT();
788
	
901
	
789
	while (w && (unsigned long)dst & 7)
902
	while (w && (unsigned long)dst & 7)
Lines 793-808 Link Here
793
	    if (m)
906
	    if (m)
794
	    {
907
	    {
795
		ullong d = *dst;
908
		ullong d = *dst;
796
		Vector4x16 vd = (Vector4x16)d;
909
		__m64 vd = (__m64)d;
797
		Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0));
910
		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
798
		*dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0);
911
		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
799
	    }
912
	    }
800
	    
913
	    
801
	    w--;
914
	    w--;
802
	    mask++;
915
	    mask++;
803
	    dst++;
916
	    dst++;
804
	}
917
	}
805
918
	
806
	CHECKPOINT();
919
	CHECKPOINT();
807
	
920
	
808
	while (w >= 4)
921
	while (w >= 4)
Lines 812-846 Link Here
812
	    m1 = *(mask + 1);
925
	    m1 = *(mask + 1);
813
	    m2 = *(mask + 2);
926
	    m2 = *(mask + 2);
814
	    m3 = *(mask + 3);
927
	    m3 = *(mask + 3);
815
928
	    
816
	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
929
	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
817
	    {
930
	    {
818
		*(unsigned long long *)dst = srcsrcsrcsrc;
931
		*(unsigned long long *)dst = srcsrcsrcsrc;
819
	    }
932
	    }
820
	    else if (m0 | m1 | m2 | m3)
933
	    else if (m0 | m1 | m2 | m3)
821
	    {
934
	    {
822
		Vector4x16 vdest;
935
		__m64 vdest;
823
		Vector4x16 vm0, vm1, vm2, vm3;
936
		__m64 vm0, vm1, vm2, vm3;
824
937
		
825
		vdest = *(Vector4x16 *)dst;
938
		vdest = *(__m64 *)dst;
826
939
		
827
		vm0 = (Vector4x16)m0;
940
		vm0 = (__m64)m0;
828
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
941
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
829
		vm1 = (Vector4x16)m1;
942
		vm1 = (__m64)m1;
830
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
943
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
831
		vm2 = (Vector4x16)m2;
944
		vm2 = (__m64)m2;
832
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
945
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
833
		vm3 = (Vector4x16)m3;
946
		vm3 = (__m64)m3;
834
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
947
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
835
		
948
		
836
		*(Vector4x16 *)dst = vdest;
949
		*(__m64 *)dst = vdest;
837
	    }
950
	    }
838
	    
951
	    
839
	    w -= 4;
952
	    w -= 4;
840
	    mask += 4;
953
	    mask += 4;
841
	    dst += 4;
954
	    dst += 4;
842
	}
955
	}
843
956
	
844
	CHECKPOINT();
957
	CHECKPOINT();
845
	
958
	
846
	while (w)
959
	while (w)
Lines 850-858 Link Here
850
	    if (m)
963
	    if (m)
851
	    {
964
	    {
852
		ullong d = *dst;
965
		ullong d = *dst;
853
		Vector4x16 vd = (Vector4x16)d;
966
		__m64 vd = (__m64)d;
854
		Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0));
967
		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
855
		*dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0);
968
		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
856
	    }
969
	    }
857
	    
970
	    
858
	    w--;
971
	    w--;
Lines 861-867 Link Here
861
	}
974
	}
862
    }
975
    }
863
    
976
    
864
    emms();
977
    _mm_empty();
865
}
978
}
866
979
867
void
980
void
Lines 887-895 Link Here
887
    
1000
    
888
    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
1001
    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
889
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
1002
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
890
1003
    
891
    assert (pSrc->pDrawable == pMask->pDrawable);
1004
    assert (pSrc->pDrawable == pMask->pDrawable);
892
1005
    
893
    while (height--)
1006
    while (height--)
894
    {
1007
    {
895
	dst = dstLine;
1008
	dst = dstLine;
Lines 897-910 Link Here
897
	src = srcLine;
1010
	src = srcLine;
898
	srcLine += srcStride;
1011
	srcLine += srcStride;
899
	w = width;
1012
	w = width;
900
1013
	
901
	CHECKPOINT();
1014
	CHECKPOINT();
902
	
1015
	
903
	while (w && (unsigned long)dst & 7)
1016
	while (w && (unsigned long)dst & 7)
904
	{
1017
	{
905
	    Vector4x16 vsrc = load8888 (*src);
1018
	    __m64 vsrc = load8888 (*src);
906
	    ullong d = *dst;
1019
	    ullong d = *dst;
907
	    Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
1020
	    __m64 vdest = expand565 ((__m64)d, 0);
908
	    
1021
	    
909
	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
1022
	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
910
	    
1023
	    
Lines 914-932 Link Here
914
	    dst++;
1027
	    dst++;
915
	    src++;
1028
	    src++;
916
	}
1029
	}
917
1030
	
918
	CHECKPOINT();
1031
	CHECKPOINT();
919
	
1032
	
920
	while (w >= 4)
1033
	while (w >= 4)
921
	{
1034
	{
922
	    CARD32 s0, s1, s2, s3;
1035
	    CARD32 s0, s1, s2, s3;
923
	    unsigned char a0, a1, a2, a3;
1036
	    unsigned char a0, a1, a2, a3;
924
1037
	    
925
	    s0 = *src;
1038
	    s0 = *src;
926
	    s1 = *(src + 1);
1039
	    s1 = *(src + 1);
927
	    s2 = *(src + 2);
1040
	    s2 = *(src + 2);
928
	    s3 = *(src + 3);
1041
	    s3 = *(src + 3);
929
1042
	    
930
	    a0 = (s0 >> 24);
1043
	    a0 = (s0 >> 24);
931
	    a1 = (s1 >> 24);
1044
	    a1 = (s1 >> 24);
932
	    a2 = (s2 >> 24);
1045
	    a2 = (s2 >> 24);
Lines 934-971 Link Here
934
	    
1047
	    
935
	    if ((a0 & a1 & a2 & a3) == 0xFF)
1048
	    if ((a0 & a1 & a2 & a3) == 0xFF)
936
	    {
1049
	    {
937
		Vector4x16 vdest;
1050
		__m64 vdest;
938
		vdest = pack565(invert_colors(load8888(s0)), (Vector4x16)c.mmx_zero, 0);
1051
		vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
939
		vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
1052
		vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
940
		vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
1053
		vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
941
		vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
1054
		vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
942
1055
		
943
		*(Vector4x16 *)dst = vdest;
1056
		*(__m64 *)dst = vdest;
944
	    }
1057
	    }
945
	    else if (a0 | a1 | a2 | a3)
1058
	    else if (a0 | a1 | a2 | a3)
946
	    {
1059
	    {
947
		Vector4x16 vdest = *(Vector4x16 *)dst;
1060
		__m64 vdest = *(__m64 *)dst;
948
1061
		
949
		vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
1062
		vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
950
	        vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
1063
	        vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
951
		vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
1064
		vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
952
		vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
1065
		vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
953
1066
		
954
		*(Vector4x16 *)dst = vdest;
1067
		*(__m64 *)dst = vdest;
955
	    }
1068
	    }
956
	    
1069
	    
957
	    w -= 4;
1070
	    w -= 4;
958
	    dst += 4;
1071
	    dst += 4;
959
	    src += 4;
1072
	    src += 4;
960
	}
1073
	}
961
1074
	
962
	CHECKPOINT();
1075
	CHECKPOINT();
963
	
1076
	
964
	while (w)
1077
	while (w)
965
	{
1078
	{
966
	    Vector4x16 vsrc = load8888 (*src);
1079
	    __m64 vsrc = load8888 (*src);
967
	    ullong d = *dst;
1080
	    ullong d = *dst;
968
	    Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
1081
	    __m64 vdest = expand565 ((__m64)d, 0);
969
	    
1082
	    
970
	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
1083
	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
971
	    
1084
	    
Lines 976-986 Link Here
976
	    src++;
1089
	    src++;
977
	}
1090
	}
978
    }
1091
    }
979
1092
    
980
    emms();
1093
    _mm_empty();
981
}
1094
}
982
1095
983
/* "888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
1096
/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
984
1097
985
void
1098
void
986
fbCompositeSrc_8888RevNPx8888mmx (CARD8      op,
1099
fbCompositeSrc_8888RevNPx8888mmx (CARD8      op,
Lines 1005-1013 Link Here
1005
    
1118
    
1006
    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
1119
    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
1007
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
1120
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
1008
1121
    
1009
    assert (pSrc->pDrawable == pMask->pDrawable);
1122
    assert (pSrc->pDrawable == pMask->pDrawable);
1010
1123
    
1011
    while (height--)
1124
    while (height--)
1012
    {
1125
    {
1013
	dst = dstLine;
1126
	dst = dstLine;
Lines 1015-1042 Link Here
1015
	src = srcLine;
1128
	src = srcLine;
1016
	srcLine += srcStride;
1129
	srcLine += srcStride;
1017
	w = width;
1130
	w = width;
1018
1131
	
1019
	while (w && (unsigned long)dst & 7)
1132
	while (w && (unsigned long)dst & 7)
1020
	{
1133
	{
1021
	    Vector4x16 s = load8888 (*src);
1134
	    __m64 s = load8888 (*src);
1022
	    Vector4x16 d = load8888 (*dst);
1135
	    __m64 d = load8888 (*dst);
1023
	    
1136
	    
1024
	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero);
1137
	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
1025
	    
1138
	    
1026
	    w--;
1139
	    w--;
1027
	    dst++;
1140
	    dst++;
1028
	    src++;
1141
	    src++;
1029
	}
1142
	}
1030
1143
	
1031
	while (w >= 2)
1144
	while (w >= 2)
1032
	{
1145
	{
1033
	    ullong s0, s1;
1146
	    ullong s0, s1;
1034
	    unsigned char a0, a1;
1147
	    unsigned char a0, a1;
1035
	    Vector4x16 d0, d1;
1148
	    __m64 d0, d1;
1036
1149
	    
1037
	    s0 = *src;
1150
	    s0 = *src;
1038
	    s1 = *(src + 1);
1151
	    s1 = *(src + 1);
1039
1152
	    
1040
	    a0 = (s0 >> 24);
1153
	    a0 = (s0 >> 24);
1041
	    a1 = (s1 >> 24);
1154
	    a1 = (s1 >> 24);
1042
	    
1155
	    
Lines 1044-1060 Link Here
1044
	    {
1157
	    {
1045
		d0 = invert_colors(load8888(s0));
1158
		d0 = invert_colors(load8888(s0));
1046
		d1 = invert_colors(load8888(s1));
1159
		d1 = invert_colors(load8888(s1));
1047
1160
		
1048
		*(Vector8x8 *)dst = pack8888 (d0, d1);
1161
		*(__m64 *)dst = pack8888 (d0, d1);
1049
	    }
1162
	    }
1050
	    else if (a0 | a1)
1163
	    else if (a0 | a1)
1051
	    {
1164
	    {
1052
		Vector4x16 vdest = *(Vector4x16 *)dst;
1165
		__m64 vdest = *(__m64 *)dst;
1053
1166
		
1054
		d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
1167
		d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
1055
		d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
1168
		d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
1056
	    
1169
		
1057
		*(Vector8x8 *)dst = pack8888 (d0, d1);
1170
		*(__m64 *)dst = pack8888 (d0, d1);
1058
	    }
1171
	    }
1059
	    
1172
	    
1060
	    w -= 2;
1173
	    w -= 2;
Lines 1064-1081 Link Here
1064
	
1177
	
1065
	while (w)
1178
	while (w)
1066
	{
1179
	{
1067
	    Vector4x16 s = load8888 (*src);
1180
	    __m64 s = load8888 (*src);
1068
	    Vector4x16 d = load8888 (*dst);
1181
	    __m64 d = load8888 (*dst);
1069
	    
1182
	    
1070
	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero);
1183
	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
1071
	    
1184
	    
1072
	    w--;
1185
	    w--;
1073
	    dst++;
1186
	    dst++;
1074
	    src++;
1187
	    src++;
1075
	}
1188
	}
1076
    }
1189
    }
1077
1190
    
1078
    emms();
1191
    _mm_empty();
1079
}
1192
}
1080
1193
1081
void
1194
void
Lines 1096-1102 Link Here
1096
    CARD16	*dstLine;
1209
    CARD16	*dstLine;
1097
    CARD32	*maskLine;
1210
    CARD32	*maskLine;
1098
    FbStride	dstStride, maskStride;
1211
    FbStride	dstStride, maskStride;
1099
    Vector4x16  vsrc, vsrca;
1212
    __m64  vsrc, vsrca;
1100
    
1213
    
1101
    CHECKPOINT();
1214
    CHECKPOINT();
1102
    
1215
    
Lines 1125-1131 Link Here
1125
	    if (m)
1238
	    if (m)
1126
	    {
1239
	    {
1127
		ullong d = *q;
1240
		ullong d = *q;
1128
		Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
1241
		__m64 vdest = expand565 ((__m64)d, 0);
1129
		vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
1242
		vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
1130
		*q = (ullong)vdest;
1243
		*q = (ullong)vdest;
1131
	    }
1244
	    }
Lines 1146-1159 Link Here
1146
	    
1259
	    
1147
	    if ((m0 | m1 | m2 | m3))
1260
	    if ((m0 | m1 | m2 | m3))
1148
	    {
1261
	    {
1149
		Vector4x16 vdest = *(Vector4x16 *)q;
1262
		__m64 vdest = *(__m64 *)q;
1150
		
1263
		
1151
		vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
1264
		vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
1152
		vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
1265
		vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
1153
		vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
1266
		vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
1154
		vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
1267
		vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
1155
		
1268
		
1156
		*(Vector4x16 *)q = vdest;
1269
		*(__m64 *)q = vdest;
1157
	    }
1270
	    }
1158
	    twidth -= 4;
1271
	    twidth -= 4;
1159
	    p += 4;
1272
	    p += 4;
Lines 1168-1174 Link Here
1168
	    if (m)
1281
	    if (m)
1169
	    {
1282
	    {
1170
		ullong d = *q;
1283
		ullong d = *q;
1171
		Vector4x16 vdest = expand565((Vector4x16)d, 0);
1284
		__m64 vdest = expand565((__m64)d, 0);
1172
		vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
1285
		vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
1173
		*q = (ullong)vdest;
1286
		*q = (ullong)vdest;
1174
	    }
1287
	    }
Lines 1182-1188 Link Here
1182
	dstLine += dstStride;
1295
	dstLine += dstStride;
1183
    }
1296
    }
1184
    
1297
    
1185
    emms ();
1298
    _mm_empty ();
1186
}
1299
}
1187
1300
1188
void
1301
void
Lines 1210-1216 Link Here
1210
    
1323
    
1211
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1);
1324
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1);
1212
    fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
1325
    fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
1213
1326
    
1214
    while (height--)
1327
    while (height--)
1215
    {
1328
    {
1216
	dst = dstLine;
1329
	dst = dstLine;
Lines 1218-1224 Link Here
1218
	src = srcLine;
1331
	src = srcLine;
1219
	srcLine += srcStride;
1332
	srcLine += srcStride;
1220
	w = width;
1333
	w = width;
1221
1334
	
1222
	while (w && (unsigned long)dst & 7)
1335
	while (w && (unsigned long)dst & 7)
1223
	{
1336
	{
1224
	    s = *src;
1337
	    s = *src;
Lines 1234-1246 Link Here
1234
	
1347
	
1235
	while (w >= 8)
1348
	while (w >= 8)
1236
	{
1349
	{
1237
	    __asm__ __volatile__ (
1350
	    *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
1238
		"movq (%0), %%mm2\n\t"
1239
		"movq (%1), %%mm3\n\t"
1240
		"paddusb %%mm2, %%mm3\n\t"
1241
		"movq %%mm3, (%1)\n\t"
1242
		: /* no output */ : "r" (src), "r" (dst));
1243
	    
1244
	    dst += 8;
1351
	    dst += 8;
1245
	    src += 8;
1352
	    src += 8;
1246
	    w -= 8;
1353
	    w -= 8;
Lines 1259-1266 Link Here
1259
	    w--;
1366
	    w--;
1260
	}
1367
	}
1261
    }
1368
    }
1262
1369
    
1263
    emms();
1370
    _mm_empty();
1264
}
1371
}
1265
1372
1266
void
1373
void
Lines 1297-1309 Link Here
1297
	
1404
	
1298
	while (w && (unsigned long)dst & 7)
1405
	while (w && (unsigned long)dst & 7)
1299
	{
1406
	{
1300
	    __asm__ __volatile__ (
1407
	    *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
1301
		"movd %0, %%mm2\n\t"
1408
						 _mm_cvtsi32_si64(*dst)));
1302
		"movd %1, %%mm3\n\t"
1303
		"paddusb %%mm2, %%mm3\n\t"
1304
		"movd %%mm3, %1\n\t"
1305
		: /* no output */ : "m" (*src), "m" (*dst));
1306
	    
1307
	    dst++;
1409
	    dst++;
1308
	    src++;
1410
	    src++;
1309
	    w--;
1411
	    w--;
Lines 1311-1323 Link Here
1311
	
1413
	
1312
	while (w >= 2)
1414
	while (w >= 2)
1313
	{
1415
	{
1314
	    __asm__ __volatile__ (
1416
	    *(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
1315
		"movq (%0), %%mm2\n\t"
1316
		"movq (%1), %%mm3\n\t"
1317
		"paddusb %%mm2, %%mm3\n\t"
1318
		"movq %%mm3, (%1)\n\t"
1319
		: /* no output */ : "r" (src), "r" (dst));
1320
	    
1321
	    dst += 2;
1417
	    dst += 2;
1322
	    src += 2;
1418
	    src += 2;
1323
	    w -= 2;
1419
	    w -= 2;
Lines 1325-1340 Link Here
1325
	
1421
	
1326
	if (w)
1422
	if (w)
1327
	{
1423
	{
1328
	    __asm__ __volatile__ (
1424
	    *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
1329
		"movd %0, %%mm2\n\t"
1425
						 _mm_cvtsi32_si64(*dst)));
1330
		"movd %1, %%mm3\n\t"
1426
	    
1331
		"paddusb %%mm2, %%mm3\n\t"
1332
		"movd %%mm3, %1\n\t"
1333
		: /* no output */ : "m" (*src), "m" (*dst));
1334
	}
1427
	}
1335
    }
1428
    }
1336
1429
    
1337
    emms();
1430
    _mm_empty();
1338
}
1431
}
1339
1432
1340
#define GetStart(drw,x,y,type,stride,line,bpp) {\
1433
#define GetStart(drw,x,y,type,stride,line,bpp) {\
Lines 1358-1376 Link Here
1358
    FbStride	stride;
1451
    FbStride	stride;
1359
    int		bpp;
1452
    int		bpp;
1360
    ullong	fill;
1453
    ullong	fill;
1361
    Vector8x8	vfill;
1454
    __m64	vfill;
1362
    CARD32	byte_width;
1455
    CARD32	byte_width;
1363
    CARD8	*byte_line;
1456
    CARD8	*byte_line;
1364
    FbBits      *bits;
1457
    FbBits      *bits;
1365
    int		xoff, yoff;
1458
    int		xoff, yoff;
1366
    
1459
    
1367
    CHECKPOINT();
1460
    CHECKPOINT();
1368
1461
    
1369
    fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
1462
    fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
1370
1463
    
1371
    if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
1464
    if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
1372
	return FALSE;
1465
	return FALSE;
1373
1466
    
1374
    if (bpp != 16 && bpp != 32)
1467
    if (bpp != 16 && bpp != 32)
1375
	return FALSE;
1468
	return FALSE;
1376
    
1469
    
Lines 1388-1396 Link Here
1388
	byte_width = 4 * width;
1481
	byte_width = 4 * width;
1389
	stride *= 4;
1482
	stride *= 4;
1390
    }
1483
    }
1391
1484
    
1392
    fill = ((ullong)xor << 32) | xor;
1485
    fill = ((ullong)xor << 32) | xor;
1393
    vfill = (Vector8x8)fill;
1486
    vfill = (__m64)fill;
1394
    
1487
    
1395
    while (height--)
1488
    while (height--)
1396
    {
1489
    {
Lines 1398-1404 Link Here
1398
	CARD8 *d = byte_line;
1491
	CARD8 *d = byte_line;
1399
	byte_line += stride;
1492
	byte_line += stride;
1400
	w = byte_width;
1493
	w = byte_width;
1401
1494
	
1402
	while (w >= 2 && ((unsigned long)d & 3))
1495
	while (w >= 2 && ((unsigned long)d & 3))
1403
	{
1496
	{
1404
	    *(CARD16 *)d = xor;
1497
	    *(CARD16 *)d = xor;
Lines 1406-1440 Link Here
1406
	    d += 2;
1499
	    d += 2;
1407
	}
1500
	}
1408
	
1501
	
1409
	while (w >= 4 && ((unsigned int)d & 7))
1502
	while (w >= 4 && ((unsigned long)d & 7))
1410
	{
1503
	{
1411
	    *(CARD32 *)d = xor;
1504
	    *(CARD32 *)d = xor;
1412
1505
	    
1413
	    w -= 4;
1506
	    w -= 4;
1414
	    d += 4;
1507
	    d += 4;
1415
	}
1508
	}
1416
	
1509
	
1417
	while (w >= 64)
1510
	while (w >= 64)
1418
	{
1511
	{
1419
	    __asm__ __volatile  (
1512
	    *(__m64*) (d +  0) = vfill;
1420
		"movq %0, (%1)\n\t"
1513
	    *(__m64*) (d +  8) = vfill;
1421
		"movq %0, 8(%1)\n\t"
1514
	    *(__m64*) (d + 16) = vfill;
1422
		"movq %0, 16(%1)\n\t"
1515
	    *(__m64*) (d + 24) = vfill;
1423
		"movq %0, 24(%1)\n\t"
1516
	    *(__m64*) (d + 32) = vfill;
1424
		"movq %0, 32(%1)\n\t"
1517
	    *(__m64*) (d + 40) = vfill;
1425
		"movq %0, 40(%1)\n\t"
1518
	    *(__m64*) (d + 48) = vfill;
1426
		"movq %0, 48(%1)\n\t"
1519
	    *(__m64*) (d + 56) = vfill;
1427
		"movq %0, 56(%1)\n\t"
1520
	    
1428
		: /* no output */
1429
		: "y" (vfill), "r" (d)
1430
		: "memory");
1431
	    w -= 64;
1521
	    w -= 64;
1432
	    d += 64;
1522
	    d += 64;
1433
	}
1523
	}
1434
	while (w >= 4)
1524
	while (w >= 4)
1435
	{
1525
	{
1436
	    *(CARD32 *)d = xor;
1526
	    *(CARD32 *)d = xor;
1437
1527
	    
1438
	    w -= 4;
1528
	    w -= 4;
1439
	    d += 4;
1529
	    d += 4;
1440
	}
1530
	}
Lines 1446-1461 Link Here
1446
	}
1536
	}
1447
    }
1537
    }
1448
    
1538
    
1449
    emms();
1539
    _mm_empty();
1540
    return TRUE;
1541
}
1542
1543
Bool
1544
fbCopyAreammx (DrawablePtr	pSrc,
1545
	       DrawablePtr	pDst,
1546
	       int		src_x,
1547
	       int		src_y,
1548
	       int		dst_x,
1549
	       int		dst_y,
1550
	       int		width,
1551
	       int		height)
1552
{
1553
    FbBits *	src_bits;
1554
    FbStride	src_stride;
1555
    int		src_bpp;
1556
    int		src_xoff;
1557
    int		src_yoff;
1558
1559
    FbBits *	dst_bits;
1560
    FbStride	dst_stride;
1561
    int		dst_bpp;
1562
    int		dst_xoff;
1563
    int		dst_yoff;
1564
1565
    CARD8 *	src_bytes;
1566
    CARD8 *	dst_bytes;
1567
    int		byte_width;
1568
    
1569
    fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
1570
    fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);
1571
1572
    if (src_bpp != 16 && src_bpp != 32)
1573
	return FALSE;
1574
1575
    if (dst_bpp != 16 && dst_bpp != 32)
1576
	return FALSE;
1577
1578
    if (src_bpp != dst_bpp)
1579
    {
1580
	return FALSE;
1581
    }
1582
    
1583
    if (src_bpp == 16)
1584
    {
1585
	src_stride = src_stride * sizeof (FbBits) / 2;
1586
	dst_stride = dst_stride * sizeof (FbBits) / 2;
1587
	src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
1588
	dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
1589
	byte_width = 2 * width;
1590
	src_stride *= 2;
1591
	dst_stride *= 2;
1592
    }
1593
    else
1594
    {
1595
	src_stride = src_stride * sizeof (FbBits) / 4;
1596
	dst_stride = dst_stride * sizeof (FbBits) / 4;
1597
	src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
1598
	dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
1599
	byte_width = 4 * width;
1600
	src_stride *= 4;
1601
	dst_stride *= 4;
1602
    }
1603
1604
    while (height--)
1605
    {
1606
	int w;
1607
	CARD8 *s = src_bytes;
1608
	CARD8 *d = dst_bytes;
1609
	src_bytes += src_stride;
1610
	dst_bytes += dst_stride;
1611
	w = byte_width;
1612
	
1613
	while (w >= 2 && ((unsigned long)d & 3))
1614
	{
1615
	    *(CARD16 *)d = *(CARD16 *)s;
1616
	    w -= 2;
1617
	    s += 2;
1618
	    d += 2;
1619
	}
1620
	
1621
	while (w >= 4 && ((unsigned int)d & 7))
1622
	{
1623
	    *(CARD32 *)d = *(CARD32 *)s;
1624
	    
1625
	    w -= 4;
1626
	    s += 4;
1627
	    d += 4;
1628
	}
1629
	
1630
	while (w >= 64)
1631
	{
1632
	    *(__m64 *)(d + 0)  = *(__m64 *)(s + 0);
1633
	    *(__m64 *)(d + 8)  = *(__m64 *)(s + 8);
1634
	    *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
1635
	    *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
1636
	    *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
1637
	    *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
1638
	    *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
1639
	    *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
1640
	    w -= 64;
1641
	    s += 64;
1642
	    d += 64;
1643
	}
1644
	while (w >= 4)
1645
	{
1646
	    *(CARD32 *)d = *(CARD32 *)s;
1647
1648
	    w -= 4;
1649
	    s += 4;
1650
	    d += 4;
1651
	}
1652
	if (w >= 2)
1653
	{
1654
	    *(CARD16 *)d = *(CARD16 *)s;
1655
	    w -= 2;
1656
	    s += 2;
1657
	    d += 2;
1658
	}
1659
    }
1660
    
1661
    _mm_empty();
1450
    return TRUE;
1662
    return TRUE;
1451
}
1663
}
1452
1664
1665
void
1666
fbCompositeCopyAreammx (CARD8		op,
1667
			PicturePtr	pSrc,
1668
			PicturePtr	pMask,
1669
			PicturePtr	pDst,
1670
			INT16		xSrc,
1671
			INT16		ySrc,
1672
			INT16		xMask,
1673
			INT16		yMask,
1674
			INT16		xDst,
1675
			INT16		yDst,
1676
			CARD16		width,
1677
			CARD16		height)
1678
{
1679
    fbCopyAreammx (pSrc->pDrawable,
1680
		   pDst->pDrawable,
1681
		   xSrc, ySrc,
1682
		   xDst, yDst,
1683
		   width, height);
1684
}
1685
1686
#ifndef __amd64__
1453
Bool
1687
Bool
1454
fbHaveMMX (void)
1688
fbHaveMMX (void)
1455
{
1689
{
1456
    static Bool initialized = FALSE;
1690
    static Bool initialized = FALSE;
1457
    static Bool mmx_present;
1691
    static Bool mmx_present;
1458
1692
    
1459
    if (!initialized)
1693
    if (!initialized)
1460
    {
1694
    {
1461
	int tmp; /* static variables are accessed through %ebx,
1695
	int tmp; /* static variables are accessed through %ebx,
Lines 1466-1472 Link Here
1466
	
1700
	
1467
	__asm__ __volatile__ (
1701
	__asm__ __volatile__ (
1468
/* Check if bit 21 in flags word is writeable */
1702
/* Check if bit 21 in flags word is writeable */
1469
1703
	    
1470
	    "pusha			        \n\t"
1704
	    "pusha			        \n\t"
1471
	    "pushfl				\n\t"
1705
	    "pushfl				\n\t"
1472
	    "popl	%%eax			\n\t"
1706
	    "popl	%%eax			\n\t"
Lines 1502-1514 Link Here
1502
	    : /* no input */);
1736
	    : /* no input */);
1503
	
1737
	
1504
	initialized = TRUE;
1738
	initialized = TRUE;
1505
1739
	
1506
	mmx_present = tmp;
1740
	mmx_present = tmp;
1507
    }
1741
    }
1508
    
1742
    
1509
    return mmx_present;
1743
    return mmx_present;
1510
}
1744
}
1745
#endif /* __amd64__ */
1511
1746
1512
1747
1513
#endif /* RENDER */
1748
#endif /* RENDER */
1514
#endif /* USE_GCC34_MMX */
1749
#endif /* USE_MMX */

Return to bug 80685