Lines 1-5
Link Here
|
1 |
/* |
1 |
/* |
2 |
* Copyright © 2004 Red Hat, Inc. |
2 |
* Copyright © 2004 Red Hat, Inc. |
|
|
3 |
* Copyright © 2004 Nicholas Miell |
3 |
* |
4 |
* |
4 |
* Permission to use, copy, modify, distribute, and sell this software and its |
5 |
* Permission to use, copy, modify, distribute, and sell this software and its |
5 |
* documentation for any purpose is hereby granted without fee, provided that |
6 |
* documentation for any purpose is hereby granted without fee, provided that |
Lines 18-31
Link Here
|
18 |
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
19 |
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
19 |
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
20 |
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
20 |
* |
21 |
* |
21 |
* Author: Søren Sandmann (sandmann@redhat.com) |
22 |
* Author: Søren Sandmann (sandmann@redhat.com) |
22 |
* |
23 |
* Minor Improvements: Nicholas Miell (nmiell@gmail.com) |
|
|
24 |
* |
23 |
* Based on work by Owen Taylor |
25 |
* Based on work by Owen Taylor |
24 |
*/ |
26 |
*/ |
25 |
|
27 |
|
|
|
28 |
|
29 |
#ifdef USE_MMX |
30 |
|
26 |
#include "fb.h" |
31 |
#include "fb.h" |
|
|
32 |
#include "fbmmx.h" |
33 |
|
34 |
#include <mmintrin.h> |
27 |
|
35 |
|
28 |
#ifdef USE_GCC34_MMX |
36 |
#ifdef USE_SSE |
|
|
37 |
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ |
38 |
#endif |
29 |
|
39 |
|
30 |
#ifdef RENDER |
40 |
#ifdef RENDER |
31 |
|
41 |
|
Lines 33-43
Link Here
|
33 |
#include "mipict.h" |
43 |
#include "mipict.h" |
34 |
#include "fbpict.h" |
44 |
#include "fbpict.h" |
35 |
|
45 |
|
36 |
typedef int Vector1x64 __attribute__ ((mode(DI))); |
|
|
37 |
typedef int Vector2x32 __attribute__ ((mode(V2SI))); |
38 |
typedef int Vector4x16 __attribute__ ((mode(V4HI))); |
39 |
typedef int Vector8x8 __attribute__ ((mode(V8QI))); |
40 |
|
41 |
typedef unsigned long long ullong; |
46 |
typedef unsigned long long ullong; |
42 |
|
47 |
|
43 |
#define noVERBOSE |
48 |
#define noVERBOSE |
Lines 50-56
Link Here
|
50 |
|
55 |
|
51 |
typedef struct |
56 |
typedef struct |
52 |
{ |
57 |
{ |
53 |
ullong mmx_zero; |
|
|
54 |
ullong mmx_4x00ff; |
58 |
ullong mmx_4x00ff; |
55 |
ullong mmx_4x0080; |
59 |
ullong mmx_4x0080; |
56 |
ullong mmx_565_rgb; |
60 |
ullong mmx_565_rgb; |
Lines 70-76
Link Here
|
70 |
|
74 |
|
71 |
static const MMXData c = |
75 |
static const MMXData c = |
72 |
{ |
76 |
{ |
73 |
.mmx_zero = 0x0000000000000000ULL, |
|
|
74 |
.mmx_4x00ff = 0x00ff00ff00ff00ffULL, |
77 |
.mmx_4x00ff = 0x00ff00ff00ff00ffULL, |
75 |
.mmx_4x0080 = 0x0080008000800080ULL, |
78 |
.mmx_4x0080 = 0x0080008000800080ULL, |
76 |
.mmx_565_rgb = 0x000001f0003f001fULL, |
79 |
.mmx_565_rgb = 0x000001f0003f001fULL, |
Lines 88-208
Link Here
|
88 |
.mmx_000000000000ffff = 0x000000000000ffffULL, |
91 |
.mmx_000000000000ffff = 0x000000000000ffffULL, |
89 |
}; |
92 |
}; |
90 |
|
93 |
|
91 |
static __inline__ Vector1x64 |
94 |
#define MC(x) ((__m64) c.mmx_##x) |
92 |
shift (Vector1x64 v, int s) |
95 |
|
|
|
96 |
static __inline__ __m64 |
97 |
shift (__m64 v, int s) |
93 |
{ |
98 |
{ |
94 |
if (s > 0) |
99 |
if (s > 0) |
95 |
return __builtin_ia32_psllq (v, s); |
100 |
return _mm_slli_si64 (v, s); |
96 |
else if (s < 0) |
101 |
else if (s < 0) |
97 |
return __builtin_ia32_psrlq (v, -s); |
102 |
return _mm_srli_si64 (v, -s); |
98 |
else |
103 |
else |
99 |
return v; |
104 |
return v; |
100 |
} |
105 |
} |
101 |
|
106 |
|
102 |
static __inline__ Vector4x16 |
107 |
static __inline__ __m64 |
103 |
negate (Vector4x16 mask) |
108 |
negate (__m64 mask) |
104 |
{ |
109 |
{ |
105 |
return (Vector4x16)__builtin_ia32_pxor ( |
110 |
return _mm_xor_si64 (mask, MC(4x00ff)); |
106 |
(Vector1x64)mask, |
|
|
107 |
(Vector1x64)c.mmx_4x00ff); |
108 |
} |
111 |
} |
109 |
|
112 |
|
110 |
static __inline__ Vector4x16 |
113 |
static __inline__ __m64 |
111 |
pix_multiply (Vector4x16 a, Vector4x16 b) |
114 |
pix_multiply (__m64 a, __m64 b) |
112 |
{ |
115 |
{ |
113 |
Vector4x16 res; |
116 |
__m64 res; |
114 |
|
117 |
|
115 |
res = __builtin_ia32_pmullw (a, b); |
118 |
res = _mm_mullo_pi16 (a, b); |
116 |
res = __builtin_ia32_paddw (res, (Vector4x16)c.mmx_4x0080); |
119 |
res = _mm_add_pi16 (res, MC(4x0080)); |
117 |
res = __builtin_ia32_psrlw (res, 8); |
120 |
res = _mm_srli_pi16 (res, 8); |
118 |
|
121 |
|
119 |
return res; |
122 |
return res; |
120 |
} |
123 |
} |
121 |
|
124 |
|
122 |
#if 0 |
125 |
#ifdef USE_SSE |
123 |
#define HAVE_PSHUFW |
126 |
#define HAVE_PSHUFW |
124 |
#endif |
127 |
#endif |
125 |
|
128 |
|
126 |
#ifdef HAVE_PSHUFW |
129 |
#ifdef HAVE_PSHUFW |
127 |
|
130 |
|
128 |
static __inline__ Vector4x16 |
131 |
static __inline__ __m64 |
129 |
expand_alpha (Vector4x16 pixel) |
132 |
expand_alpha (__m64 pixel) |
130 |
{ |
133 |
{ |
131 |
Vector4x16 result; |
134 |
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3)); |
132 |
__asm__ ("pshufw $0xFF, %1, %0\n\t" : "=y" (result) : "y" (pixel)); |
|
|
133 |
return result; |
134 |
} |
135 |
} |
135 |
|
136 |
|
136 |
static __inline__ Vector4x16 |
137 |
static __inline__ __m64 |
137 |
expand_alpha_rev (Vector4x16 pixel) |
138 |
expand_alpha_rev (__m64 pixel) |
138 |
{ |
139 |
{ |
139 |
Vector4x16 result; |
140 |
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0)); |
140 |
__asm__ ("pshufw $0x00, %1, %0\n\t" : "=y" (result) : "y" (pixel)); |
|
|
141 |
return result; |
142 |
} |
141 |
} |
143 |
|
142 |
|
144 |
static __inline__ Vector4x16 |
143 |
static __inline__ __m64 |
145 |
invert_colors (Vector4x16 pixel) |
144 |
invert_colors (__m64 pixel) |
146 |
{ |
145 |
{ |
147 |
Vector4x16 result; |
146 |
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2)); |
148 |
|
|
|
149 |
/* 0xC6 = 11000110 */ |
150 |
/* 3 0 1 2 */ |
151 |
|
152 |
__asm__ ("pshufw $0xC6, %1, %0\n\t" : "=y" (result) : "y" (pixel)); |
153 |
|
154 |
return result; |
155 |
} |
147 |
} |
156 |
|
148 |
|
157 |
#else |
149 |
#else |
158 |
|
150 |
|
159 |
static __inline__ Vector4x16 |
151 |
static __inline__ __m64 |
160 |
expand_alpha (Vector4x16 pixel) |
152 |
expand_alpha (__m64 pixel) |
161 |
{ |
153 |
{ |
162 |
Vector1x64 t1, t2; |
154 |
__m64 t1, t2; |
163 |
|
155 |
|
164 |
t1 = shift ((Vector1x64)pixel, -48); |
156 |
t1 = shift (pixel, -48); |
165 |
t2 = shift (t1, 16); |
157 |
t2 = shift (t1, 16); |
166 |
t1 = __builtin_ia32_por (t1, t2); |
158 |
t1 = _mm_or_si64 (t1, t2); |
167 |
t2 = shift (t1, 32); |
159 |
t2 = shift (t1, 32); |
168 |
t1 = __builtin_ia32_por (t1, t2); |
160 |
t1 = _mm_or_si64 (t1, t2); |
169 |
|
161 |
|
170 |
return (Vector4x16)t1; |
162 |
return t1; |
171 |
} |
163 |
} |
172 |
|
164 |
|
173 |
static __inline__ Vector4x16 |
165 |
static __inline__ __m64 |
174 |
expand_alpha_rev (Vector4x16 pixel) |
166 |
expand_alpha_rev (__m64 pixel) |
175 |
{ |
167 |
{ |
176 |
Vector1x64 t1, t2; |
168 |
__m64 t1, t2; |
177 |
|
169 |
|
178 |
t1 = shift ((Vector1x64)pixel, 48); |
170 |
/* move alpha to low 16 bits and zero the rest */ |
|
|
171 |
t1 = shift (pixel, 48); |
179 |
t1 = shift (t1, -48); |
172 |
t1 = shift (t1, -48); |
|
|
173 |
|
180 |
t2 = shift (t1, 16); |
174 |
t2 = shift (t1, 16); |
181 |
t1 = __builtin_ia32_por (t1, t2); |
175 |
t1 = _mm_or_si64 (t1, t2); |
182 |
t2 = shift (t1, 32); |
176 |
t2 = shift (t1, 32); |
183 |
t1 = __builtin_ia32_por (t1, t2); |
177 |
t1 = _mm_or_si64 (t1, t2); |
184 |
|
178 |
|
185 |
return (Vector4x16)t1; |
179 |
return t1; |
186 |
} |
180 |
} |
187 |
|
181 |
|
188 |
static __inline__ Vector4x16 |
182 |
static __inline__ __m64 |
189 |
invert_colors (Vector4x16 pixel) |
183 |
invert_colors (__m64 pixel) |
190 |
{ |
184 |
{ |
191 |
Vector1x64 x, y, z; |
185 |
__m64 x, y, z; |
192 |
|
186 |
|
193 |
x = y = z = (Vector1x64)pixel; |
187 |
x = y = z = pixel; |
194 |
|
188 |
|
195 |
x = __builtin_ia32_pand (x, (Vector1x64)c.mmx_ffff0000ffff0000); |
189 |
x = _mm_and_si64 (x, MC(ffff0000ffff0000)); |
196 |
y = __builtin_ia32_pand (y, (Vector1x64)c.mmx_000000000000ffff); |
190 |
y = _mm_and_si64 (y, MC(000000000000ffff)); |
197 |
z = __builtin_ia32_pand (z, (Vector1x64)c.mmx_0000ffff00000000); |
191 |
z = _mm_and_si64 (z, MC(0000ffff00000000)); |
198 |
|
192 |
|
199 |
y = shift (y, 32); |
193 |
y = shift (y, 32); |
200 |
z = shift (z, -32); |
194 |
z = shift (z, -32); |
201 |
|
195 |
|
202 |
x = __builtin_ia32_por (x, y); |
196 |
x = _mm_or_si64 (x, y); |
203 |
x = __builtin_ia32_por (x, z); |
197 |
x = _mm_or_si64 (x, z); |
204 |
|
198 |
|
205 |
return (Vector4x16)x; |
199 |
return x; |
206 |
} |
200 |
} |
207 |
|
201 |
|
208 |
#endif |
202 |
#endif |
Lines 210-356
Link Here
|
210 |
/* Notes about writing mmx code |
204 |
/* Notes about writing mmx code |
211 |
* |
205 |
* |
212 |
* give memory operands as the second operand. If you give it as the |
206 |
* give memory operands as the second operand. If you give it as the |
213 |
* first, gcc will first load it into a register, then use that register |
207 |
* first, gcc will first load it into a register, then use that |
|
|
208 |
* register |
214 |
* |
209 |
* |
215 |
* ie. use |
210 |
* ie. use |
216 |
* |
211 |
* |
217 |
* __builtin_pmullw (x, mmx_constant[8]); |
212 |
* _mm_mullo_pi16 (x, mmx_constant); |
218 |
* |
213 |
* |
219 |
* not |
214 |
* not |
220 |
* |
215 |
* |
221 |
* __builtin_pmullw (mmx_constant[8], x); |
216 |
* _mm_mullo_pi16 (mmx_constant, x); |
222 |
* |
217 |
* |
223 |
* Also try to minimize dependencies. Ie. when you need a value, try to calculate |
218 |
* Also try to minimize dependencies. i.e. when you need a value, try |
224 |
* it from a value that was calculated as early as possible. |
219 |
* to calculate it from a value that was calculated as early as |
|
|
220 |
* possible. |
225 |
*/ |
221 |
*/ |
226 |
|
222 |
|
227 |
static __inline__ Vector4x16 |
223 |
static __inline__ __m64 |
228 |
over (Vector4x16 src, Vector4x16 srca, Vector4x16 dest) |
224 |
over (__m64 src, __m64 srca, __m64 dest) |
229 |
{ |
225 |
{ |
230 |
return (Vector4x16)__builtin_ia32_paddusb ((Vector8x8)src, (Vector8x8)pix_multiply(dest, negate(srca))); |
226 |
return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca))); |
231 |
} |
227 |
} |
232 |
|
228 |
|
233 |
static __inline__ Vector4x16 |
229 |
static __inline__ __m64 |
234 |
over_rev_non_pre (Vector4x16 src, Vector4x16 dest) |
230 |
over_rev_non_pre (__m64 src, __m64 dest) |
235 |
{ |
231 |
{ |
236 |
Vector4x16 srca = expand_alpha (src); |
232 |
__m64 srca = expand_alpha (src); |
237 |
Vector4x16 srcfaaa = (Vector4x16)__builtin_ia32_por((Vector1x64)srca, (Vector1x64)c.mmx_full_alpha); |
233 |
__m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha)); |
238 |
|
234 |
|
239 |
return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); |
235 |
return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); |
240 |
} |
236 |
} |
241 |
|
237 |
|
242 |
static __inline__ Vector4x16 |
238 |
static __inline__ __m64 |
243 |
in (Vector4x16 src, |
239 |
in (__m64 src, |
244 |
Vector4x16 mask) |
240 |
__m64 mask) |
245 |
{ |
241 |
{ |
246 |
return pix_multiply (src, mask); |
242 |
return pix_multiply (src, mask); |
247 |
} |
243 |
} |
248 |
|
244 |
|
249 |
static __inline__ Vector4x16 |
245 |
static __inline__ __m64 |
250 |
in_over (Vector4x16 src, |
246 |
in_over (__m64 src, |
251 |
Vector4x16 srca, |
247 |
__m64 srca, |
252 |
Vector4x16 mask, |
248 |
__m64 mask, |
253 |
Vector4x16 dest) |
249 |
__m64 dest) |
254 |
{ |
250 |
{ |
255 |
return over(in(src, mask), pix_multiply(srca, mask), dest); |
251 |
return over(in(src, mask), pix_multiply(srca, mask), dest); |
256 |
} |
252 |
} |
257 |
|
253 |
|
258 |
static __inline__ Vector8x8 |
254 |
static __inline__ __m64 |
259 |
cvt32to64 (CARD32 v) |
|
|
260 |
{ |
261 |
ullong r = v; |
262 |
return (Vector8x8)r; |
263 |
} |
264 |
|
265 |
static __inline__ Vector4x16 |
266 |
load8888 (CARD32 v) |
255 |
load8888 (CARD32 v) |
267 |
{ |
256 |
{ |
268 |
return (Vector4x16)__builtin_ia32_punpcklbw (cvt32to64 (v), |
257 |
return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64()); |
269 |
(Vector8x8)c.mmx_zero); |
|
|
270 |
} |
258 |
} |
271 |
|
259 |
|
272 |
static __inline__ Vector8x8 |
260 |
static __inline__ __m64 |
273 |
pack8888 (Vector4x16 lo, Vector4x16 hi) |
261 |
pack8888 (__m64 lo, __m64 hi) |
274 |
{ |
262 |
{ |
275 |
Vector8x8 r; |
263 |
__m64 r; |
276 |
r = __builtin_ia32_packuswb ((Vector4x16)lo, (Vector4x16)hi); |
264 |
r = _mm_packs_pu16 (lo, hi); |
277 |
return r; |
265 |
return r; |
278 |
} |
266 |
} |
279 |
|
267 |
|
280 |
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into 00RR00GG00BB |
268 |
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into |
281 |
|
269 |
* |
282 |
--- Expanding 565 in the low word --- |
270 |
* 00RR00GG00BB |
283 |
|
271 |
* |
284 |
m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
272 |
* --- Expanding 565 in the low word --- |
285 |
m = m & (01f0003f001f); |
273 |
* |
286 |
m = m * (008404100840); |
274 |
* m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
287 |
m = m >> 8; |
275 |
* m = m & (01f0003f001f); |
288 |
|
276 |
* m = m * (008404100840); |
289 |
Note the trick here - the top word is shifted by another nibble to avoid |
277 |
* m = m >> 8; |
290 |
it bumping into the middle word |
278 |
* |
291 |
*/ |
279 |
* Note the trick here - the top word is shifted by another nibble to |
292 |
static __inline__ Vector4x16 |
280 |
* avoid it bumping into the middle word |
293 |
expand565 (Vector4x16 pixel, int pos) |
281 |
*/ |
|
|
282 |
static __inline__ __m64 |
283 |
expand565 (__m64 pixel, int pos) |
294 |
{ |
284 |
{ |
295 |
Vector1x64 p = (Vector1x64)pixel; |
285 |
__m64 p = pixel; |
|
|
286 |
__m64 t1, t2; |
296 |
|
287 |
|
297 |
/* move pixel to low 16 bit and zero the rest */ |
288 |
/* move pixel to low 16 bit and zero the rest */ |
298 |
p = shift (shift (p, (3 - pos) * 16), -48); |
289 |
p = shift (shift (p, (3 - pos) * 16), -48); |
299 |
|
290 |
|
300 |
Vector1x64 t1 = shift (p, 36 - 11); |
291 |
t1 = shift (p, 36 - 11); |
301 |
Vector1x64 t2 = shift (p, 16 - 5); |
292 |
t2 = shift (p, 16 - 5); |
302 |
|
293 |
|
303 |
p = __builtin_ia32_por (t1, p); |
294 |
p = _mm_or_si64 (t1, p); |
304 |
p = __builtin_ia32_por (t2, p); |
295 |
p = _mm_or_si64 (t2, p); |
305 |
p = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_rgb); |
296 |
p = _mm_and_si64 (p, MC(565_rgb)); |
306 |
|
297 |
|
307 |
pixel = __builtin_ia32_pmullw ((Vector4x16)p, (Vector4x16)c.mmx_565_unpack_multiplier); |
298 |
pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier)); |
308 |
return __builtin_ia32_psrlw (pixel, 8); |
299 |
return _mm_srli_pi16 (pixel, 8); |
309 |
} |
300 |
} |
310 |
|
301 |
|
311 |
static __inline__ Vector4x16 |
302 |
static __inline__ __m64 |
312 |
expand8888 (Vector4x16 in, int pos) |
303 |
expand8888 (__m64 in, int pos) |
313 |
{ |
304 |
{ |
314 |
if (pos == 0) |
305 |
if (pos == 0) |
315 |
return (Vector4x16)__builtin_ia32_punpcklbw ((Vector8x8)in, (Vector8x8)c.mmx_zero); |
306 |
return _mm_unpacklo_pi8 (in, _mm_setzero_si64()); |
316 |
else |
307 |
else |
317 |
return (Vector4x16)__builtin_ia32_punpckhbw ((Vector8x8)in, (Vector8x8)c.mmx_zero); |
308 |
return _mm_unpackhi_pi8 (in, _mm_setzero_si64()); |
318 |
} |
309 |
} |
319 |
|
310 |
|
320 |
static __inline__ Vector4x16 |
311 |
static __inline__ __m64 |
321 |
pack565 (Vector4x16 pixel, Vector4x16 target, int pos) |
312 |
pack565 (__m64 pixel, __m64 target, int pos) |
322 |
{ |
313 |
{ |
323 |
Vector1x64 p = (Vector1x64)pixel; |
314 |
__m64 p = pixel; |
324 |
Vector1x64 t = (Vector1x64)target; |
315 |
__m64 t = target; |
325 |
Vector1x64 r, g, b; |
316 |
__m64 r, g, b; |
326 |
|
317 |
|
327 |
r = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_r); |
318 |
r = _mm_and_si64 (p, MC(565_r)); |
328 |
g = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_g); |
319 |
g = _mm_and_si64 (p, MC(565_g)); |
329 |
b = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_b); |
320 |
b = _mm_and_si64 (p, MC(565_b)); |
330 |
|
321 |
|
331 |
r = shift (r, - (32 - 8) + pos * 16); |
322 |
r = shift (r, - (32 - 8) + pos * 16); |
332 |
g = shift (g, - (16 - 3) + pos * 16); |
323 |
g = shift (g, - (16 - 3) + pos * 16); |
333 |
b = shift (b, - (0 + 3) + pos * 16); |
324 |
b = shift (b, - (0 + 3) + pos * 16); |
334 |
|
325 |
|
335 |
if (pos == 0) |
326 |
if (pos == 0) |
336 |
t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_0); |
327 |
t = _mm_and_si64 (t, MC(mask_0)); |
337 |
else if (pos == 1) |
328 |
else if (pos == 1) |
338 |
t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_1); |
329 |
t = _mm_and_si64 (t, MC(mask_1)); |
339 |
else if (pos == 2) |
330 |
else if (pos == 2) |
340 |
t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_2); |
331 |
t = _mm_and_si64 (t, MC(mask_2)); |
341 |
else if (pos == 3) |
332 |
else if (pos == 3) |
342 |
t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_3); |
333 |
t = _mm_and_si64 (t, MC(mask_3)); |
343 |
|
334 |
|
344 |
p = __builtin_ia32_por (r, t); |
335 |
p = _mm_or_si64 (r, t); |
345 |
p = __builtin_ia32_por (g, p); |
336 |
p = _mm_or_si64 (g, p); |
346 |
|
337 |
|
347 |
return (Vector4x16)__builtin_ia32_por (b, p); |
338 |
return _mm_or_si64 (b, p); |
348 |
} |
|
|
349 |
|
350 |
static __inline__ void |
351 |
emms (void) |
352 |
{ |
353 |
__asm__ __volatile__ ("emms"); |
354 |
} |
339 |
} |
355 |
|
340 |
|
356 |
void |
341 |
void |
Lines 371-378
Link Here
|
371 |
CARD32 *dstLine, *dst; |
356 |
CARD32 *dstLine, *dst; |
372 |
CARD16 w; |
357 |
CARD16 w; |
373 |
FbStride dstStride; |
358 |
FbStride dstStride; |
374 |
Vector4x16 vsrc, vsrca; |
359 |
__m64 vsrc, vsrca; |
375 |
|
360 |
|
376 |
CHECKPOINT(); |
361 |
CHECKPOINT(); |
377 |
|
362 |
|
378 |
fbComposeGetSolid(pSrc, src, pDst->format); |
363 |
fbComposeGetSolid(pSrc, src, pDst->format); |
Lines 384-434
Link Here
|
384 |
|
369 |
|
385 |
vsrc = load8888 (src); |
370 |
vsrc = load8888 (src); |
386 |
vsrca = expand_alpha (vsrc); |
371 |
vsrca = expand_alpha (vsrc); |
387 |
|
372 |
|
388 |
while (height--) |
373 |
while (height--) |
389 |
{ |
374 |
{ |
390 |
dst = dstLine; |
375 |
dst = dstLine; |
391 |
dstLine += dstStride; |
376 |
dstLine += dstStride; |
392 |
w = width; |
377 |
w = width; |
393 |
|
378 |
|
394 |
CHECKPOINT(); |
379 |
CHECKPOINT(); |
395 |
|
380 |
|
396 |
while (w && (unsigned long)dst & 7) |
381 |
while (w && (unsigned long)dst & 7) |
397 |
{ |
382 |
{ |
398 |
*dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero); |
383 |
*dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), |
|
|
384 |
_mm_setzero_si64()); |
399 |
|
385 |
|
400 |
w--; |
386 |
w--; |
401 |
dst++; |
387 |
dst++; |
402 |
} |
388 |
} |
403 |
|
389 |
|
404 |
while (w >= 2) |
390 |
while (w >= 2) |
405 |
{ |
391 |
{ |
406 |
Vector4x16 vdest; |
392 |
__m64 vdest; |
407 |
Vector4x16 dest0, dest1; |
393 |
__m64 dest0, dest1; |
408 |
|
394 |
|
409 |
vdest = *(Vector4x16 *)dst; |
395 |
vdest = *(__m64 *)dst; |
410 |
|
396 |
|
411 |
dest0 = over(vsrc, vsrca, expand8888(vdest, 0)); |
397 |
dest0 = over(vsrc, vsrca, expand8888(vdest, 0)); |
412 |
dest1 = over(vsrc, vsrca, expand8888(vdest, 1)); |
398 |
dest1 = over(vsrc, vsrca, expand8888(vdest, 1)); |
413 |
|
399 |
|
414 |
*(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1); |
400 |
*(__m64 *)dst = pack8888(dest0, dest1); |
415 |
|
401 |
|
416 |
dst += 2; |
402 |
dst += 2; |
417 |
w -= 2; |
403 |
w -= 2; |
418 |
} |
404 |
} |
419 |
|
405 |
|
420 |
CHECKPOINT(); |
406 |
CHECKPOINT(); |
421 |
|
407 |
|
422 |
while (w) |
408 |
while (w) |
423 |
{ |
409 |
{ |
424 |
*dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero); |
410 |
*dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), _mm_setzero_si64()); |
425 |
|
411 |
|
426 |
w--; |
412 |
w--; |
427 |
dst++; |
413 |
dst++; |
428 |
} |
414 |
} |
429 |
} |
415 |
} |
430 |
|
416 |
|
431 |
emms(); |
417 |
_mm_empty(); |
432 |
} |
418 |
} |
433 |
|
419 |
|
434 |
void |
420 |
void |
Lines 449-456
Link Here
|
449 |
CARD16 *dstLine, *dst; |
435 |
CARD16 *dstLine, *dst; |
450 |
CARD16 w; |
436 |
CARD16 w; |
451 |
FbStride dstStride; |
437 |
FbStride dstStride; |
452 |
Vector4x16 vsrc, vsrca; |
438 |
__m64 vsrc, vsrca; |
453 |
|
439 |
|
454 |
CHECKPOINT(); |
440 |
CHECKPOINT(); |
455 |
|
441 |
|
456 |
fbComposeGetSolid(pSrc, src, pDst->format); |
442 |
fbComposeGetSolid(pSrc, src, pDst->format); |
Lines 462-510
Link Here
|
462 |
|
448 |
|
463 |
vsrc = load8888 (src); |
449 |
vsrc = load8888 (src); |
464 |
vsrca = expand_alpha (vsrc); |
450 |
vsrca = expand_alpha (vsrc); |
465 |
|
451 |
|
466 |
while (height--) |
452 |
while (height--) |
467 |
{ |
453 |
{ |
468 |
dst = dstLine; |
454 |
dst = dstLine; |
469 |
dstLine += dstStride; |
455 |
dstLine += dstStride; |
470 |
w = width; |
456 |
w = width; |
471 |
|
457 |
|
472 |
CHECKPOINT(); |
458 |
CHECKPOINT(); |
473 |
|
459 |
|
474 |
while (w && (unsigned long)dst & 7) |
460 |
while (w && (unsigned long)dst & 7) |
475 |
{ |
461 |
{ |
476 |
ullong d = *dst; |
462 |
ullong d = *dst; |
477 |
Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
463 |
__m64 vdest = expand565 ((__m64)d, 0); |
478 |
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); |
464 |
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); |
479 |
*dst = (ullong)vdest; |
465 |
*dst = (ullong)vdest; |
480 |
|
466 |
|
481 |
w--; |
467 |
w--; |
482 |
dst++; |
468 |
dst++; |
483 |
} |
469 |
} |
484 |
|
470 |
|
485 |
while (w >= 4) |
471 |
while (w >= 4) |
486 |
{ |
472 |
{ |
487 |
Vector4x16 vdest; |
473 |
__m64 vdest; |
488 |
|
474 |
|
489 |
vdest = *(Vector4x16 *)dst; |
475 |
vdest = *(__m64 *)dst; |
490 |
|
476 |
|
491 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0); |
477 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0); |
492 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1); |
478 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1); |
493 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2); |
479 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2); |
494 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3); |
480 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3); |
495 |
|
481 |
|
496 |
*(Vector8x8 *)dst = (Vector8x8)vdest; |
482 |
*(__m64 *)dst = vdest; |
497 |
|
483 |
|
498 |
dst += 4; |
484 |
dst += 4; |
499 |
w -= 4; |
485 |
w -= 4; |
500 |
} |
486 |
} |
501 |
|
487 |
|
502 |
CHECKPOINT(); |
488 |
CHECKPOINT(); |
503 |
|
489 |
|
504 |
while (w) |
490 |
while (w) |
505 |
{ |
491 |
{ |
506 |
ullong d = *dst; |
492 |
ullong d = *dst; |
507 |
Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
493 |
__m64 vdest = expand565 ((__m64)d, 0); |
508 |
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); |
494 |
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); |
509 |
*dst = (ullong)vdest; |
495 |
*dst = (ullong)vdest; |
510 |
|
496 |
|
Lines 513-519
Link Here
|
513 |
} |
499 |
} |
514 |
} |
500 |
} |
515 |
|
501 |
|
516 |
emms(); |
502 |
_mm_empty(); |
517 |
} |
503 |
} |
518 |
|
504 |
|
519 |
void |
505 |
void |
Lines 534-541
Link Here
|
534 |
CARD32 *dstLine; |
520 |
CARD32 *dstLine; |
535 |
CARD32 *maskLine; |
521 |
CARD32 *maskLine; |
536 |
FbStride dstStride, maskStride; |
522 |
FbStride dstStride, maskStride; |
537 |
Vector4x16 vsrc, vsrca; |
523 |
__m64 vsrc, vsrca; |
538 |
|
524 |
|
539 |
CHECKPOINT(); |
525 |
CHECKPOINT(); |
540 |
|
526 |
|
541 |
fbComposeGetSolid(pSrc, src, pDst->format); |
527 |
fbComposeGetSolid(pSrc, src, pDst->format); |
Lines 562-570
Link Here
|
562 |
|
548 |
|
563 |
if (m) |
549 |
if (m) |
564 |
{ |
550 |
{ |
565 |
Vector4x16 vdest = load8888(*q); |
551 |
__m64 vdest = load8888(*q); |
566 |
vdest = in_over(vsrc, vsrca, load8888(m), vdest); |
552 |
vdest = in_over(vsrc, vsrca, load8888(m), vdest); |
567 |
*q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
553 |
*q = (ullong)pack8888(vdest, _mm_setzero_si64()); |
568 |
} |
554 |
} |
569 |
|
555 |
|
570 |
twidth--; |
556 |
twidth--; |
Lines 580-594
Link Here
|
580 |
|
566 |
|
581 |
if (m0 | m1) |
567 |
if (m0 | m1) |
582 |
{ |
568 |
{ |
583 |
Vector4x16 dest0, dest1; |
569 |
__m64 dest0, dest1; |
584 |
Vector4x16 vdest = *(Vector4x16 *)q; |
570 |
__m64 vdest = *(__m64 *)q; |
585 |
|
571 |
|
586 |
dest0 = in_over(vsrc, vsrca, load8888(m0), |
572 |
dest0 = in_over(vsrc, vsrca, load8888(m0), |
587 |
expand8888 (vdest, 0)); |
573 |
expand8888 (vdest, 0)); |
588 |
dest1 = in_over(vsrc, vsrca, load8888(m1), |
574 |
dest1 = in_over(vsrc, vsrca, load8888(m1), |
589 |
expand8888 (vdest, 1)); |
575 |
expand8888 (vdest, 1)); |
590 |
|
576 |
|
591 |
*(Vector8x8 *)q = (Vector8x8)pack8888(dest0, dest1); |
577 |
*(__m64 *)q = pack8888(dest0, dest1); |
592 |
} |
578 |
} |
593 |
|
579 |
|
594 |
p += 2; |
580 |
p += 2; |
Lines 602-610
Link Here
|
602 |
|
588 |
|
603 |
if (m) |
589 |
if (m) |
604 |
{ |
590 |
{ |
605 |
Vector4x16 vdest = load8888(*q); |
591 |
__m64 vdest = load8888(*q); |
606 |
vdest = in_over(vsrc, vsrca, load8888(m), vdest); |
592 |
vdest = in_over(vsrc, vsrca, load8888(m), vdest); |
607 |
*q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
593 |
*q = (ullong)pack8888(vdest, _mm_setzero_si64()); |
608 |
} |
594 |
} |
609 |
|
595 |
|
610 |
twidth--; |
596 |
twidth--; |
Lines 616-622
Link Here
|
616 |
maskLine += maskStride; |
602 |
maskLine += maskStride; |
617 |
} |
603 |
} |
618 |
|
604 |
|
619 |
emms(); |
605 |
_mm_empty(); |
|
|
606 |
} |
607 |
|
608 |
void |
609 |
fbCompositeSrc_8888x8x8888mmx (CARD8 op, |
610 |
PicturePtr pSrc, |
611 |
PicturePtr pMask, |
612 |
PicturePtr pDst, |
613 |
INT16 xSrc, |
614 |
INT16 ySrc, |
615 |
INT16 xMask, |
616 |
INT16 yMask, |
617 |
INT16 xDst, |
618 |
INT16 yDst, |
619 |
CARD16 width, |
620 |
CARD16 height) |
621 |
{ |
622 |
CARD32 *dstLine, *dst; |
623 |
CARD32 *srcLine, *src; |
624 |
CARD8 *maskLine; |
625 |
CARD32 mask; |
626 |
__m64 vmask; |
627 |
FbStride dstStride, srcStride, maskStride; |
628 |
CARD16 w; |
629 |
__m64 srca; |
630 |
|
631 |
CHECKPOINT(); |
632 |
|
633 |
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
634 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
635 |
fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); |
636 |
|
637 |
mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine; |
638 |
vmask = load8888 (mask); |
639 |
srca = MC(4x00ff); |
640 |
|
641 |
while (height--) |
642 |
{ |
643 |
dst = dstLine; |
644 |
dstLine += dstStride; |
645 |
src = srcLine; |
646 |
srcLine += srcStride; |
647 |
w = width; |
648 |
|
649 |
while (w && (unsigned long)dst & 7) |
650 |
{ |
651 |
__m64 s = load8888 (*src); |
652 |
__m64 d = load8888 (*dst); |
653 |
|
654 |
*dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); |
655 |
|
656 |
w--; |
657 |
dst++; |
658 |
src++; |
659 |
} |
660 |
|
661 |
while (w >= 16) |
662 |
{ |
663 |
__m64 vd0 = *(__m64 *)(dst + 0); |
664 |
__m64 vd1 = *(__m64 *)(dst + 2); |
665 |
__m64 vd2 = *(__m64 *)(dst + 4); |
666 |
__m64 vd3 = *(__m64 *)(dst + 6); |
667 |
__m64 vd4 = *(__m64 *)(dst + 8); |
668 |
__m64 vd5 = *(__m64 *)(dst + 10); |
669 |
__m64 vd6 = *(__m64 *)(dst + 12); |
670 |
__m64 vd7 = *(__m64 *)(dst + 14); |
671 |
|
672 |
__m64 vs0 = *(__m64 *)(src + 0); |
673 |
__m64 vs1 = *(__m64 *)(src + 2); |
674 |
__m64 vs2 = *(__m64 *)(src + 4); |
675 |
__m64 vs3 = *(__m64 *)(src + 6); |
676 |
__m64 vs4 = *(__m64 *)(src + 8); |
677 |
__m64 vs5 = *(__m64 *)(src + 10); |
678 |
__m64 vs6 = *(__m64 *)(src + 12); |
679 |
__m64 vs7 = *(__m64 *)(dst + 14); |
680 |
|
681 |
vd0 = (__m64)pack8888 ( |
682 |
in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), |
683 |
in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); |
684 |
|
685 |
vd1 = (__m64)pack8888 ( |
686 |
in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), |
687 |
in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); |
688 |
|
689 |
vd2 = (__m64)pack8888 ( |
690 |
in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), |
691 |
in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); |
692 |
|
693 |
vd3 = (__m64)pack8888 ( |
694 |
in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), |
695 |
in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); |
696 |
|
697 |
vd4 = (__m64)pack8888 ( |
698 |
in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), |
699 |
in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); |
700 |
|
701 |
vd5 = (__m64)pack8888 ( |
702 |
in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), |
703 |
in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); |
704 |
|
705 |
vd6 = (__m64)pack8888 ( |
706 |
in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), |
707 |
in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); |
708 |
|
709 |
vd7 = (__m64)pack8888 ( |
710 |
in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), |
711 |
in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); |
712 |
|
713 |
w -= 16; |
714 |
dst += 16; |
715 |
src += 16; |
716 |
} |
717 |
|
718 |
while (w) |
719 |
{ |
720 |
__m64 s = load8888 (*src); |
721 |
__m64 d = load8888 (*dst); |
722 |
|
723 |
*dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); |
724 |
|
725 |
w--; |
726 |
dst++; |
727 |
src++; |
728 |
} |
729 |
} |
730 |
|
731 |
_mm_empty(); |
620 |
} |
732 |
} |
621 |
|
733 |
|
622 |
void |
734 |
void |
Lines 638-644
Link Here
|
638 |
CARD8 *maskLine, *mask; |
750 |
CARD8 *maskLine, *mask; |
639 |
FbStride dstStride, maskStride; |
751 |
FbStride dstStride, maskStride; |
640 |
CARD16 w; |
752 |
CARD16 w; |
641 |
Vector4x16 vsrc, vsrca; |
753 |
__m64 vsrc, vsrca; |
642 |
ullong srcsrc; |
754 |
ullong srcsrc; |
643 |
|
755 |
|
644 |
CHECKPOINT(); |
756 |
CHECKPOINT(); |
Lines 648-654
Link Here
|
648 |
srca = src >> 24; |
760 |
srca = src >> 24; |
649 |
if (srca == 0) |
761 |
if (srca == 0) |
650 |
return; |
762 |
return; |
651 |
|
763 |
|
652 |
srcsrc = (unsigned long long)src << 32 | src; |
764 |
srcsrc = (unsigned long long)src << 32 | src; |
653 |
|
765 |
|
654 |
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
766 |
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
Lines 664-670
Link Here
|
664 |
mask = maskLine; |
776 |
mask = maskLine; |
665 |
maskLine += maskStride; |
777 |
maskLine += maskStride; |
666 |
w = width; |
778 |
w = width; |
667 |
|
779 |
|
668 |
CHECKPOINT(); |
780 |
CHECKPOINT(); |
669 |
|
781 |
|
670 |
while (w && (unsigned long)dst & 7) |
782 |
while (w && (unsigned long)dst & 7) |
Lines 673-687
Link Here
|
673 |
|
785 |
|
674 |
if (m) |
786 |
if (m) |
675 |
{ |
787 |
{ |
676 |
Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), load8888(*dst)); |
788 |
__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst)); |
677 |
*dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
789 |
*dst = (ullong)pack8888(vdest, _mm_setzero_si64()); |
678 |
} |
790 |
} |
679 |
|
791 |
|
680 |
w--; |
792 |
w--; |
681 |
mask++; |
793 |
mask++; |
682 |
dst++; |
794 |
dst++; |
683 |
} |
795 |
} |
684 |
|
796 |
|
685 |
CHECKPOINT(); |
797 |
CHECKPOINT(); |
686 |
|
798 |
|
687 |
while (w >= 2) |
799 |
while (w >= 2) |
Lines 689-717
Link Here
|
689 |
ullong m0, m1; |
801 |
ullong m0, m1; |
690 |
m0 = *mask; |
802 |
m0 = *mask; |
691 |
m1 = *(mask + 1); |
803 |
m1 = *(mask + 1); |
692 |
|
804 |
|
693 |
if (srca == 0xff && (m0 & m1) == 0xff) |
805 |
if (srca == 0xff && (m0 & m1) == 0xff) |
694 |
{ |
806 |
{ |
695 |
*(unsigned long long *)dst = srcsrc; |
807 |
*(unsigned long long *)dst = srcsrc; |
696 |
} |
808 |
} |
697 |
else if (m0 | m1) |
809 |
else if (m0 | m1) |
698 |
{ |
810 |
{ |
699 |
Vector4x16 vdest; |
811 |
__m64 vdest; |
700 |
Vector4x16 dest0, dest1; |
812 |
__m64 dest0, dest1; |
701 |
|
813 |
|
702 |
vdest = *(Vector4x16 *)dst; |
814 |
vdest = *(__m64 *)dst; |
703 |
|
815 |
|
704 |
dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m0), expand8888(vdest, 0)); |
816 |
dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0)); |
705 |
dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m1), expand8888(vdest, 1)); |
817 |
dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1)); |
706 |
|
818 |
|
707 |
*(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1); |
819 |
*(__m64 *)dst = pack8888(dest0, dest1); |
708 |
} |
820 |
} |
709 |
|
821 |
|
710 |
mask += 2; |
822 |
mask += 2; |
711 |
dst += 2; |
823 |
dst += 2; |
712 |
w -= 2; |
824 |
w -= 2; |
713 |
} |
825 |
} |
714 |
|
826 |
|
715 |
CHECKPOINT(); |
827 |
CHECKPOINT(); |
716 |
|
828 |
|
717 |
while (w) |
829 |
while (w) |
Lines 720-728
Link Here
|
720 |
|
832 |
|
721 |
if (m) |
833 |
if (m) |
722 |
{ |
834 |
{ |
723 |
Vector4x16 vdest = load8888(*dst); |
835 |
__m64 vdest = load8888(*dst); |
724 |
vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), vdest); |
836 |
vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest); |
725 |
*dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
837 |
*dst = (ullong)pack8888(vdest, _mm_setzero_si64()); |
726 |
} |
838 |
} |
727 |
|
839 |
|
728 |
w--; |
840 |
w--; |
Lines 731-737
Link Here
|
731 |
} |
843 |
} |
732 |
} |
844 |
} |
733 |
|
845 |
|
734 |
emms(); |
846 |
_mm_empty(); |
735 |
} |
847 |
} |
736 |
|
848 |
|
737 |
|
849 |
|
Lines 754-760
Link Here
|
754 |
CARD8 *maskLine, *mask; |
866 |
CARD8 *maskLine, *mask; |
755 |
FbStride dstStride, maskStride; |
867 |
FbStride dstStride, maskStride; |
756 |
CARD16 w; |
868 |
CARD16 w; |
757 |
Vector4x16 vsrc, vsrca; |
869 |
__m64 vsrc, vsrca; |
758 |
unsigned long long srcsrcsrcsrc, src16; |
870 |
unsigned long long srcsrcsrcsrc, src16; |
759 |
|
871 |
|
760 |
CHECKPOINT(); |
872 |
CHECKPOINT(); |
Lines 770-778
Link Here
|
770 |
|
882 |
|
771 |
vsrc = load8888 (src); |
883 |
vsrc = load8888 (src); |
772 |
vsrca = expand_alpha (vsrc); |
884 |
vsrca = expand_alpha (vsrc); |
773 |
|
885 |
|
774 |
src16 = (ullong)pack565(vsrc, (Vector4x16)c.mmx_zero, 0); |
886 |
src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0); |
775 |
|
887 |
|
776 |
srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 | |
888 |
srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 | |
777 |
(ullong)src16 << 16 | (ullong)src16; |
889 |
(ullong)src16 << 16 | (ullong)src16; |
778 |
|
890 |
|
Lines 783-789
Link Here
|
783 |
mask = maskLine; |
895 |
mask = maskLine; |
784 |
maskLine += maskStride; |
896 |
maskLine += maskStride; |
785 |
w = width; |
897 |
w = width; |
786 |
|
898 |
|
787 |
CHECKPOINT(); |
899 |
CHECKPOINT(); |
788 |
|
900 |
|
789 |
while (w && (unsigned long)dst & 7) |
901 |
while (w && (unsigned long)dst & 7) |
Lines 793-808
Link Here
|
793 |
if (m) |
905 |
if (m) |
794 |
{ |
906 |
{ |
795 |
ullong d = *dst; |
907 |
ullong d = *dst; |
796 |
Vector4x16 vd = (Vector4x16)d; |
908 |
__m64 vd = (__m64)d; |
797 |
Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0)); |
909 |
__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); |
798 |
*dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0); |
910 |
*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); |
799 |
} |
911 |
} |
800 |
|
912 |
|
801 |
w--; |
913 |
w--; |
802 |
mask++; |
914 |
mask++; |
803 |
dst++; |
915 |
dst++; |
804 |
} |
916 |
} |
805 |
|
917 |
|
806 |
CHECKPOINT(); |
918 |
CHECKPOINT(); |
807 |
|
919 |
|
808 |
while (w >= 4) |
920 |
while (w >= 4) |
Lines 812-846
Link Here
|
812 |
m1 = *(mask + 1); |
924 |
m1 = *(mask + 1); |
813 |
m2 = *(mask + 2); |
925 |
m2 = *(mask + 2); |
814 |
m3 = *(mask + 3); |
926 |
m3 = *(mask + 3); |
815 |
|
927 |
|
816 |
if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) |
928 |
if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) |
817 |
{ |
929 |
{ |
818 |
*(unsigned long long *)dst = srcsrcsrcsrc; |
930 |
*(unsigned long long *)dst = srcsrcsrcsrc; |
819 |
} |
931 |
} |
820 |
else if (m0 | m1 | m2 | m3) |
932 |
else if (m0 | m1 | m2 | m3) |
821 |
{ |
933 |
{ |
822 |
Vector4x16 vdest; |
934 |
__m64 vdest; |
823 |
Vector4x16 vm0, vm1, vm2, vm3; |
935 |
__m64 vm0, vm1, vm2, vm3; |
824 |
|
936 |
|
825 |
vdest = *(Vector4x16 *)dst; |
937 |
vdest = *(__m64 *)dst; |
826 |
|
938 |
|
827 |
vm0 = (Vector4x16)m0; |
939 |
vm0 = (__m64)m0; |
828 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0); |
940 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0); |
829 |
vm1 = (Vector4x16)m1; |
941 |
vm1 = (__m64)m1; |
830 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1); |
942 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1); |
831 |
vm2 = (Vector4x16)m2; |
943 |
vm2 = (__m64)m2; |
832 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2); |
944 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2); |
833 |
vm3 = (Vector4x16)m3; |
945 |
vm3 = (__m64)m3; |
834 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3); |
946 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3); |
835 |
|
947 |
|
836 |
*(Vector4x16 *)dst = vdest; |
948 |
*(__m64 *)dst = vdest; |
837 |
} |
949 |
} |
838 |
|
950 |
|
839 |
w -= 4; |
951 |
w -= 4; |
840 |
mask += 4; |
952 |
mask += 4; |
841 |
dst += 4; |
953 |
dst += 4; |
842 |
} |
954 |
} |
843 |
|
955 |
|
844 |
CHECKPOINT(); |
956 |
CHECKPOINT(); |
845 |
|
957 |
|
846 |
while (w) |
958 |
while (w) |
Lines 850-858
Link Here
|
850 |
if (m) |
962 |
if (m) |
851 |
{ |
963 |
{ |
852 |
ullong d = *dst; |
964 |
ullong d = *dst; |
853 |
Vector4x16 vd = (Vector4x16)d; |
965 |
__m64 vd = (__m64)d; |
854 |
Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0)); |
966 |
__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); |
855 |
*dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0); |
967 |
*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); |
856 |
} |
968 |
} |
857 |
|
969 |
|
858 |
w--; |
970 |
w--; |
Lines 861-867
Link Here
|
861 |
} |
973 |
} |
862 |
} |
974 |
} |
863 |
|
975 |
|
864 |
emms(); |
976 |
_mm_empty(); |
865 |
} |
977 |
} |
866 |
|
978 |
|
867 |
void |
979 |
void |
Lines 887-895
Link Here
|
887 |
|
999 |
|
888 |
fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); |
1000 |
fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); |
889 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
1001 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
890 |
|
1002 |
|
891 |
assert (pSrc->pDrawable == pMask->pDrawable); |
1003 |
assert (pSrc->pDrawable == pMask->pDrawable); |
892 |
|
1004 |
|
893 |
while (height--) |
1005 |
while (height--) |
894 |
{ |
1006 |
{ |
895 |
dst = dstLine; |
1007 |
dst = dstLine; |
Lines 897-910
Link Here
|
897 |
src = srcLine; |
1009 |
src = srcLine; |
898 |
srcLine += srcStride; |
1010 |
srcLine += srcStride; |
899 |
w = width; |
1011 |
w = width; |
900 |
|
1012 |
|
901 |
CHECKPOINT(); |
1013 |
CHECKPOINT(); |
902 |
|
1014 |
|
903 |
while (w && (unsigned long)dst & 7) |
1015 |
while (w && (unsigned long)dst & 7) |
904 |
{ |
1016 |
{ |
905 |
Vector4x16 vsrc = load8888 (*src); |
1017 |
__m64 vsrc = load8888 (*src); |
906 |
ullong d = *dst; |
1018 |
ullong d = *dst; |
907 |
Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
1019 |
__m64 vdest = expand565 ((__m64)d, 0); |
908 |
|
1020 |
|
909 |
vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); |
1021 |
vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); |
910 |
|
1022 |
|
Lines 914-932
Link Here
|
914 |
dst++; |
1026 |
dst++; |
915 |
src++; |
1027 |
src++; |
916 |
} |
1028 |
} |
917 |
|
1029 |
|
918 |
CHECKPOINT(); |
1030 |
CHECKPOINT(); |
919 |
|
1031 |
|
920 |
while (w >= 4) |
1032 |
while (w >= 4) |
921 |
{ |
1033 |
{ |
922 |
CARD32 s0, s1, s2, s3; |
1034 |
CARD32 s0, s1, s2, s3; |
923 |
unsigned char a0, a1, a2, a3; |
1035 |
unsigned char a0, a1, a2, a3; |
924 |
|
1036 |
|
925 |
s0 = *src; |
1037 |
s0 = *src; |
926 |
s1 = *(src + 1); |
1038 |
s1 = *(src + 1); |
927 |
s2 = *(src + 2); |
1039 |
s2 = *(src + 2); |
928 |
s3 = *(src + 3); |
1040 |
s3 = *(src + 3); |
929 |
|
1041 |
|
930 |
a0 = (s0 >> 24); |
1042 |
a0 = (s0 >> 24); |
931 |
a1 = (s1 >> 24); |
1043 |
a1 = (s1 >> 24); |
932 |
a2 = (s2 >> 24); |
1044 |
a2 = (s2 >> 24); |
Lines 934-971
Link Here
|
934 |
|
1046 |
|
935 |
if ((a0 & a1 & a2 & a3) == 0xFF) |
1047 |
if ((a0 & a1 & a2 & a3) == 0xFF) |
936 |
{ |
1048 |
{ |
937 |
Vector4x16 vdest; |
1049 |
__m64 vdest; |
938 |
vdest = pack565(invert_colors(load8888(s0)), (Vector4x16)c.mmx_zero, 0); |
1050 |
vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0); |
939 |
vdest = pack565(invert_colors(load8888(s1)), vdest, 1); |
1051 |
vdest = pack565(invert_colors(load8888(s1)), vdest, 1); |
940 |
vdest = pack565(invert_colors(load8888(s2)), vdest, 2); |
1052 |
vdest = pack565(invert_colors(load8888(s2)), vdest, 2); |
941 |
vdest = pack565(invert_colors(load8888(s3)), vdest, 3); |
1053 |
vdest = pack565(invert_colors(load8888(s3)), vdest, 3); |
942 |
|
1054 |
|
943 |
*(Vector4x16 *)dst = vdest; |
1055 |
*(__m64 *)dst = vdest; |
944 |
} |
1056 |
} |
945 |
else if (a0 | a1 | a2 | a3) |
1057 |
else if (a0 | a1 | a2 | a3) |
946 |
{ |
1058 |
{ |
947 |
Vector4x16 vdest = *(Vector4x16 *)dst; |
1059 |
__m64 vdest = *(__m64 *)dst; |
948 |
|
1060 |
|
949 |
vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0); |
1061 |
vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0); |
950 |
vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1); |
1062 |
vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1); |
951 |
vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2); |
1063 |
vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2); |
952 |
vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3); |
1064 |
vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3); |
953 |
|
1065 |
|
954 |
*(Vector4x16 *)dst = vdest; |
1066 |
*(__m64 *)dst = vdest; |
955 |
} |
1067 |
} |
956 |
|
1068 |
|
957 |
w -= 4; |
1069 |
w -= 4; |
958 |
dst += 4; |
1070 |
dst += 4; |
959 |
src += 4; |
1071 |
src += 4; |
960 |
} |
1072 |
} |
961 |
|
1073 |
|
962 |
CHECKPOINT(); |
1074 |
CHECKPOINT(); |
963 |
|
1075 |
|
964 |
while (w) |
1076 |
while (w) |
965 |
{ |
1077 |
{ |
966 |
Vector4x16 vsrc = load8888 (*src); |
1078 |
__m64 vsrc = load8888 (*src); |
967 |
ullong d = *dst; |
1079 |
ullong d = *dst; |
968 |
Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
1080 |
__m64 vdest = expand565 ((__m64)d, 0); |
969 |
|
1081 |
|
970 |
vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); |
1082 |
vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); |
971 |
|
1083 |
|
Lines 976-986
Link Here
|
976 |
src++; |
1088 |
src++; |
977 |
} |
1089 |
} |
978 |
} |
1090 |
} |
979 |
|
1091 |
|
980 |
emms(); |
1092 |
_mm_empty(); |
981 |
} |
1093 |
} |
982 |
|
1094 |
|
983 |
/* "888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ |
1095 |
/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ |
984 |
|
1096 |
|
985 |
void |
1097 |
void |
986 |
fbCompositeSrc_8888RevNPx8888mmx (CARD8 op, |
1098 |
fbCompositeSrc_8888RevNPx8888mmx (CARD8 op, |
Lines 1005-1013
Link Here
|
1005 |
|
1117 |
|
1006 |
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
1118 |
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
1007 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
1119 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
1008 |
|
1120 |
|
1009 |
assert (pSrc->pDrawable == pMask->pDrawable); |
1121 |
assert (pSrc->pDrawable == pMask->pDrawable); |
1010 |
|
1122 |
|
1011 |
while (height--) |
1123 |
while (height--) |
1012 |
{ |
1124 |
{ |
1013 |
dst = dstLine; |
1125 |
dst = dstLine; |
Lines 1015-1042
Link Here
|
1015 |
src = srcLine; |
1127 |
src = srcLine; |
1016 |
srcLine += srcStride; |
1128 |
srcLine += srcStride; |
1017 |
w = width; |
1129 |
w = width; |
1018 |
|
1130 |
|
1019 |
while (w && (unsigned long)dst & 7) |
1131 |
while (w && (unsigned long)dst & 7) |
1020 |
{ |
1132 |
{ |
1021 |
Vector4x16 s = load8888 (*src); |
1133 |
__m64 s = load8888 (*src); |
1022 |
Vector4x16 d = load8888 (*dst); |
1134 |
__m64 d = load8888 (*dst); |
1023 |
|
1135 |
|
1024 |
*dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero); |
1136 |
*dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); |
1025 |
|
1137 |
|
1026 |
w--; |
1138 |
w--; |
1027 |
dst++; |
1139 |
dst++; |
1028 |
src++; |
1140 |
src++; |
1029 |
} |
1141 |
} |
1030 |
|
1142 |
|
1031 |
while (w >= 2) |
1143 |
while (w >= 2) |
1032 |
{ |
1144 |
{ |
1033 |
ullong s0, s1; |
1145 |
ullong s0, s1; |
1034 |
unsigned char a0, a1; |
1146 |
unsigned char a0, a1; |
1035 |
Vector4x16 d0, d1; |
1147 |
__m64 d0, d1; |
1036 |
|
1148 |
|
1037 |
s0 = *src; |
1149 |
s0 = *src; |
1038 |
s1 = *(src + 1); |
1150 |
s1 = *(src + 1); |
1039 |
|
1151 |
|
1040 |
a0 = (s0 >> 24); |
1152 |
a0 = (s0 >> 24); |
1041 |
a1 = (s1 >> 24); |
1153 |
a1 = (s1 >> 24); |
1042 |
|
1154 |
|
Lines 1044-1060
Link Here
|
1044 |
{ |
1156 |
{ |
1045 |
d0 = invert_colors(load8888(s0)); |
1157 |
d0 = invert_colors(load8888(s0)); |
1046 |
d1 = invert_colors(load8888(s1)); |
1158 |
d1 = invert_colors(load8888(s1)); |
1047 |
|
1159 |
|
1048 |
*(Vector8x8 *)dst = pack8888 (d0, d1); |
1160 |
*(__m64 *)dst = pack8888 (d0, d1); |
1049 |
} |
1161 |
} |
1050 |
else if (a0 | a1) |
1162 |
else if (a0 | a1) |
1051 |
{ |
1163 |
{ |
1052 |
Vector4x16 vdest = *(Vector4x16 *)dst; |
1164 |
__m64 vdest = *(__m64 *)dst; |
1053 |
|
1165 |
|
1054 |
d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0)); |
1166 |
d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0)); |
1055 |
d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1)); |
1167 |
d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1)); |
1056 |
|
1168 |
|
1057 |
*(Vector8x8 *)dst = pack8888 (d0, d1); |
1169 |
*(__m64 *)dst = pack8888 (d0, d1); |
1058 |
} |
1170 |
} |
1059 |
|
1171 |
|
1060 |
w -= 2; |
1172 |
w -= 2; |
Lines 1064-1081
Link Here
|
1064 |
|
1176 |
|
1065 |
while (w) |
1177 |
while (w) |
1066 |
{ |
1178 |
{ |
1067 |
Vector4x16 s = load8888 (*src); |
1179 |
__m64 s = load8888 (*src); |
1068 |
Vector4x16 d = load8888 (*dst); |
1180 |
__m64 d = load8888 (*dst); |
1069 |
|
1181 |
|
1070 |
*dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero); |
1182 |
*dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); |
1071 |
|
1183 |
|
1072 |
w--; |
1184 |
w--; |
1073 |
dst++; |
1185 |
dst++; |
1074 |
src++; |
1186 |
src++; |
1075 |
} |
1187 |
} |
1076 |
} |
1188 |
} |
1077 |
|
1189 |
|
1078 |
emms(); |
1190 |
_mm_empty(); |
1079 |
} |
1191 |
} |
1080 |
|
1192 |
|
1081 |
void |
1193 |
void |
Lines 1096-1102
Link Here
|
1096 |
CARD16 *dstLine; |
1208 |
CARD16 *dstLine; |
1097 |
CARD32 *maskLine; |
1209 |
CARD32 *maskLine; |
1098 |
FbStride dstStride, maskStride; |
1210 |
FbStride dstStride, maskStride; |
1099 |
Vector4x16 vsrc, vsrca; |
1211 |
__m64 vsrc, vsrca; |
1100 |
|
1212 |
|
1101 |
CHECKPOINT(); |
1213 |
CHECKPOINT(); |
1102 |
|
1214 |
|
Lines 1125-1131
Link Here
|
1125 |
if (m) |
1237 |
if (m) |
1126 |
{ |
1238 |
{ |
1127 |
ullong d = *q; |
1239 |
ullong d = *q; |
1128 |
Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
1240 |
__m64 vdest = expand565 ((__m64)d, 0); |
1129 |
vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); |
1241 |
vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); |
1130 |
*q = (ullong)vdest; |
1242 |
*q = (ullong)vdest; |
1131 |
} |
1243 |
} |
Lines 1146-1159
Link Here
|
1146 |
|
1258 |
|
1147 |
if ((m0 | m1 | m2 | m3)) |
1259 |
if ((m0 | m1 | m2 | m3)) |
1148 |
{ |
1260 |
{ |
1149 |
Vector4x16 vdest = *(Vector4x16 *)q; |
1261 |
__m64 vdest = *(__m64 *)q; |
1150 |
|
1262 |
|
1151 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0); |
1263 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0); |
1152 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1); |
1264 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1); |
1153 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2); |
1265 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2); |
1154 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3); |
1266 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3); |
1155 |
|
1267 |
|
1156 |
*(Vector4x16 *)q = vdest; |
1268 |
*(__m64 *)q = vdest; |
1157 |
} |
1269 |
} |
1158 |
twidth -= 4; |
1270 |
twidth -= 4; |
1159 |
p += 4; |
1271 |
p += 4; |
Lines 1168-1174
Link Here
|
1168 |
if (m) |
1280 |
if (m) |
1169 |
{ |
1281 |
{ |
1170 |
ullong d = *q; |
1282 |
ullong d = *q; |
1171 |
Vector4x16 vdest = expand565((Vector4x16)d, 0); |
1283 |
__m64 vdest = expand565((__m64)d, 0); |
1172 |
vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0); |
1284 |
vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0); |
1173 |
*q = (ullong)vdest; |
1285 |
*q = (ullong)vdest; |
1174 |
} |
1286 |
} |
Lines 1182-1188
Link Here
|
1182 |
dstLine += dstStride; |
1294 |
dstLine += dstStride; |
1183 |
} |
1295 |
} |
1184 |
|
1296 |
|
1185 |
emms (); |
1297 |
_mm_empty (); |
1186 |
} |
1298 |
} |
1187 |
|
1299 |
|
1188 |
void |
1300 |
void |
Lines 1210-1216
Link Here
|
1210 |
|
1322 |
|
1211 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1); |
1323 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1); |
1212 |
fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1); |
1324 |
fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1); |
1213 |
|
1325 |
|
1214 |
while (height--) |
1326 |
while (height--) |
1215 |
{ |
1327 |
{ |
1216 |
dst = dstLine; |
1328 |
dst = dstLine; |
Lines 1218-1224
Link Here
|
1218 |
src = srcLine; |
1330 |
src = srcLine; |
1219 |
srcLine += srcStride; |
1331 |
srcLine += srcStride; |
1220 |
w = width; |
1332 |
w = width; |
1221 |
|
1333 |
|
1222 |
while (w && (unsigned long)dst & 7) |
1334 |
while (w && (unsigned long)dst & 7) |
1223 |
{ |
1335 |
{ |
1224 |
s = *src; |
1336 |
s = *src; |
Lines 1234-1246
Link Here
|
1234 |
|
1346 |
|
1235 |
while (w >= 8) |
1347 |
while (w >= 8) |
1236 |
{ |
1348 |
{ |
1237 |
__asm__ __volatile__ ( |
1349 |
*(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); |
1238 |
"movq (%0), %%mm2\n\t" |
|
|
1239 |
"movq (%1), %%mm3\n\t" |
1240 |
"paddusb %%mm2, %%mm3\n\t" |
1241 |
"movq %%mm3, (%1)\n\t" |
1242 |
: /* no output */ : "r" (src), "r" (dst)); |
1243 |
|
1244 |
dst += 8; |
1350 |
dst += 8; |
1245 |
src += 8; |
1351 |
src += 8; |
1246 |
w -= 8; |
1352 |
w -= 8; |
Lines 1259-1266
Link Here
|
1259 |
w--; |
1365 |
w--; |
1260 |
} |
1366 |
} |
1261 |
} |
1367 |
} |
1262 |
|
1368 |
|
1263 |
emms(); |
1369 |
_mm_empty(); |
1264 |
} |
1370 |
} |
1265 |
|
1371 |
|
1266 |
void |
1372 |
void |
Lines 1297-1309
Link Here
|
1297 |
|
1403 |
|
1298 |
while (w && (unsigned long)dst & 7) |
1404 |
while (w && (unsigned long)dst & 7) |
1299 |
{ |
1405 |
{ |
1300 |
__asm__ __volatile__ ( |
1406 |
*dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), |
1301 |
"movd %0, %%mm2\n\t" |
1407 |
_mm_cvtsi32_si64(*dst))); |
1302 |
"movd %1, %%mm3\n\t" |
|
|
1303 |
"paddusb %%mm2, %%mm3\n\t" |
1304 |
"movd %%mm3, %1\n\t" |
1305 |
: /* no output */ : "m" (*src), "m" (*dst)); |
1306 |
|
1307 |
dst++; |
1408 |
dst++; |
1308 |
src++; |
1409 |
src++; |
1309 |
w--; |
1410 |
w--; |
Lines 1311-1323
Link Here
|
1311 |
|
1412 |
|
1312 |
while (w >= 2) |
1413 |
while (w >= 2) |
1313 |
{ |
1414 |
{ |
1314 |
__asm__ __volatile__ ( |
1415 |
*(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); |
1315 |
"movq (%0), %%mm2\n\t" |
|
|
1316 |
"movq (%1), %%mm3\n\t" |
1317 |
"paddusb %%mm2, %%mm3\n\t" |
1318 |
"movq %%mm3, (%1)\n\t" |
1319 |
: /* no output */ : "r" (src), "r" (dst)); |
1320 |
|
1321 |
dst += 2; |
1416 |
dst += 2; |
1322 |
src += 2; |
1417 |
src += 2; |
1323 |
w -= 2; |
1418 |
w -= 2; |
Lines 1325-1340
Link Here
|
1325 |
|
1420 |
|
1326 |
if (w) |
1421 |
if (w) |
1327 |
{ |
1422 |
{ |
1328 |
__asm__ __volatile__ ( |
1423 |
*dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), |
1329 |
"movd %0, %%mm2\n\t" |
1424 |
_mm_cvtsi32_si64(*dst))); |
1330 |
"movd %1, %%mm3\n\t" |
1425 |
|
1331 |
"paddusb %%mm2, %%mm3\n\t" |
|
|
1332 |
"movd %%mm3, %1\n\t" |
1333 |
: /* no output */ : "m" (*src), "m" (*dst)); |
1334 |
} |
1426 |
} |
1335 |
} |
1427 |
} |
1336 |
|
1428 |
|
1337 |
emms(); |
1429 |
_mm_empty(); |
1338 |
} |
1430 |
} |
1339 |
|
1431 |
|
1340 |
#define GetStart(drw,x,y,type,stride,line,bpp) {\ |
1432 |
#define GetStart(drw,x,y,type,stride,line,bpp) {\ |
Lines 1358-1376
Link Here
|
1358 |
FbStride stride; |
1450 |
FbStride stride; |
1359 |
int bpp; |
1451 |
int bpp; |
1360 |
ullong fill; |
1452 |
ullong fill; |
1361 |
Vector8x8 vfill; |
1453 |
__m64 vfill; |
1362 |
CARD32 byte_width; |
1454 |
CARD32 byte_width; |
1363 |
CARD8 *byte_line; |
1455 |
CARD8 *byte_line; |
1364 |
FbBits *bits; |
1456 |
FbBits *bits; |
1365 |
int xoff, yoff; |
1457 |
int xoff, yoff; |
1366 |
|
1458 |
|
1367 |
CHECKPOINT(); |
1459 |
CHECKPOINT(); |
1368 |
|
1460 |
|
1369 |
fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); |
1461 |
fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); |
1370 |
|
1462 |
|
1371 |
if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) |
1463 |
if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) |
1372 |
return FALSE; |
1464 |
return FALSE; |
1373 |
|
1465 |
|
1374 |
if (bpp != 16 && bpp != 32) |
1466 |
if (bpp != 16 && bpp != 32) |
1375 |
return FALSE; |
1467 |
return FALSE; |
1376 |
|
1468 |
|
Lines 1388-1396
Link Here
|
1388 |
byte_width = 4 * width; |
1480 |
byte_width = 4 * width; |
1389 |
stride *= 4; |
1481 |
stride *= 4; |
1390 |
} |
1482 |
} |
1391 |
|
1483 |
|
1392 |
fill = ((ullong)xor << 32) | xor; |
1484 |
fill = ((ullong)xor << 32) | xor; |
1393 |
vfill = (Vector8x8)fill; |
1485 |
vfill = (__m64)fill; |
1394 |
|
1486 |
|
1395 |
while (height--) |
1487 |
while (height--) |
1396 |
{ |
1488 |
{ |
Lines 1398-1404
Link Here
|
1398 |
CARD8 *d = byte_line; |
1490 |
CARD8 *d = byte_line; |
1399 |
byte_line += stride; |
1491 |
byte_line += stride; |
1400 |
w = byte_width; |
1492 |
w = byte_width; |
1401 |
|
1493 |
|
1402 |
while (w >= 2 && ((unsigned long)d & 3)) |
1494 |
while (w >= 2 && ((unsigned long)d & 3)) |
1403 |
{ |
1495 |
{ |
1404 |
*(CARD16 *)d = xor; |
1496 |
*(CARD16 *)d = xor; |
Lines 1406-1440
Link Here
|
1406 |
d += 2; |
1498 |
d += 2; |
1407 |
} |
1499 |
} |
1408 |
|
1500 |
|
1409 |
while (w >= 4 && ((unsigned int)d & 7)) |
1501 |
while (w >= 4 && ((unsigned long)d & 7)) |
1410 |
{ |
1502 |
{ |
1411 |
*(CARD32 *)d = xor; |
1503 |
*(CARD32 *)d = xor; |
1412 |
|
1504 |
|
1413 |
w -= 4; |
1505 |
w -= 4; |
1414 |
d += 4; |
1506 |
d += 4; |
1415 |
} |
1507 |
} |
1416 |
|
1508 |
|
1417 |
while (w >= 64) |
1509 |
while (w >= 64) |
1418 |
{ |
1510 |
{ |
1419 |
__asm__ __volatile ( |
1511 |
*(__m64*) (d + 0) = vfill; |
1420 |
"movq %0, (%1)\n\t" |
1512 |
*(__m64*) (d + 8) = vfill; |
1421 |
"movq %0, 8(%1)\n\t" |
1513 |
*(__m64*) (d + 16) = vfill; |
1422 |
"movq %0, 16(%1)\n\t" |
1514 |
*(__m64*) (d + 24) = vfill; |
1423 |
"movq %0, 24(%1)\n\t" |
1515 |
*(__m64*) (d + 32) = vfill; |
1424 |
"movq %0, 32(%1)\n\t" |
1516 |
*(__m64*) (d + 40) = vfill; |
1425 |
"movq %0, 40(%1)\n\t" |
1517 |
*(__m64*) (d + 48) = vfill; |
1426 |
"movq %0, 48(%1)\n\t" |
1518 |
*(__m64*) (d + 56) = vfill; |
1427 |
"movq %0, 56(%1)\n\t" |
1519 |
|
1428 |
: /* no output */ |
|
|
1429 |
: "y" (vfill), "r" (d) |
1430 |
: "memory"); |
1431 |
w -= 64; |
1520 |
w -= 64; |
1432 |
d += 64; |
1521 |
d += 64; |
1433 |
} |
1522 |
} |
1434 |
while (w >= 4) |
1523 |
while (w >= 4) |
1435 |
{ |
1524 |
{ |
1436 |
*(CARD32 *)d = xor; |
1525 |
*(CARD32 *)d = xor; |
1437 |
|
1526 |
|
1438 |
w -= 4; |
1527 |
w -= 4; |
1439 |
d += 4; |
1528 |
d += 4; |
1440 |
} |
1529 |
} |
Lines 1446-1461
Link Here
|
1446 |
} |
1535 |
} |
1447 |
} |
1536 |
} |
1448 |
|
1537 |
|
1449 |
emms(); |
1538 |
_mm_empty(); |
|
|
1539 |
return TRUE; |
1540 |
} |
1541 |
|
1542 |
Bool |
1543 |
fbCopyAreammx (DrawablePtr pSrc, |
1544 |
DrawablePtr pDst, |
1545 |
int src_x, |
1546 |
int src_y, |
1547 |
int dst_x, |
1548 |
int dst_y, |
1549 |
int width, |
1550 |
int height) |
1551 |
{ |
1552 |
FbBits * src_bits; |
1553 |
FbStride src_stride; |
1554 |
int src_bpp; |
1555 |
int src_xoff; |
1556 |
int src_yoff; |
1557 |
|
1558 |
FbBits * dst_bits; |
1559 |
FbStride dst_stride; |
1560 |
int dst_bpp; |
1561 |
int dst_xoff; |
1562 |
int dst_yoff; |
1563 |
|
1564 |
CARD8 * src_bytes; |
1565 |
CARD8 * dst_bytes; |
1566 |
int byte_width; |
1567 |
|
1568 |
fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff); |
1569 |
fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff); |
1570 |
|
1571 |
if (src_bpp != 16 && src_bpp != 32) |
1572 |
return FALSE; |
1573 |
|
1574 |
if (dst_bpp != 16 && dst_bpp != 32) |
1575 |
return FALSE; |
1576 |
|
1577 |
if (src_bpp != dst_bpp) |
1578 |
{ |
1579 |
return FALSE; |
1580 |
} |
1581 |
|
1582 |
if (src_bpp == 16) |
1583 |
{ |
1584 |
src_stride = src_stride * sizeof (FbBits) / 2; |
1585 |
dst_stride = dst_stride * sizeof (FbBits) / 2; |
1586 |
src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); |
1587 |
dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); |
1588 |
byte_width = 2 * width; |
1589 |
src_stride *= 2; |
1590 |
dst_stride *= 2; |
1591 |
} |
1592 |
else |
1593 |
{ |
1594 |
src_stride = src_stride * sizeof (FbBits) / 4; |
1595 |
dst_stride = dst_stride * sizeof (FbBits) / 4; |
1596 |
src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); |
1597 |
dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); |
1598 |
byte_width = 4 * width; |
1599 |
src_stride *= 4; |
1600 |
dst_stride *= 4; |
1601 |
} |
1602 |
|
1603 |
while (height--) |
1604 |
{ |
1605 |
int w; |
1606 |
CARD8 *s = src_bytes; |
1607 |
CARD8 *d = dst_bytes; |
1608 |
src_bytes += src_stride; |
1609 |
dst_bytes += dst_stride; |
1610 |
w = byte_width; |
1611 |
|
1612 |
while (w >= 2 && ((unsigned long)d & 3)) |
1613 |
{ |
1614 |
*(CARD16 *)d = *(CARD16 *)s; |
1615 |
w -= 2; |
1616 |
s += 2; |
1617 |
d += 2; |
1618 |
} |
1619 |
|
1620 |
while (w >= 4 && ((unsigned int)d & 7)) |
1621 |
{ |
1622 |
*(CARD32 *)d = *(CARD32 *)s; |
1623 |
|
1624 |
w -= 4; |
1625 |
s += 4; |
1626 |
d += 4; |
1627 |
} |
1628 |
|
1629 |
while (w >= 64) |
1630 |
{ |
1631 |
*(__m64 *)(d + 0) = *(__m64 *)(s + 0); |
1632 |
*(__m64 *)(d + 8) = *(__m64 *)(s + 8); |
1633 |
*(__m64 *)(d + 16) = *(__m64 *)(s + 16); |
1634 |
*(__m64 *)(d + 24) = *(__m64 *)(s + 24); |
1635 |
*(__m64 *)(d + 32) = *(__m64 *)(s + 32); |
1636 |
*(__m64 *)(d + 40) = *(__m64 *)(s + 40); |
1637 |
*(__m64 *)(d + 48) = *(__m64 *)(s + 48); |
1638 |
*(__m64 *)(d + 56) = *(__m64 *)(s + 56); |
1639 |
w -= 64; |
1640 |
s += 64; |
1641 |
d += 64; |
1642 |
} |
1643 |
while (w >= 4) |
1644 |
{ |
1645 |
*(CARD32 *)d = *(CARD32 *)s; |
1646 |
|
1647 |
w -= 4; |
1648 |
s += 4; |
1649 |
d += 4; |
1650 |
} |
1651 |
if (w >= 2) |
1652 |
{ |
1653 |
*(CARD16 *)d = *(CARD16 *)s; |
1654 |
w -= 2; |
1655 |
s += 2; |
1656 |
d += 2; |
1657 |
} |
1658 |
} |
1659 |
|
1660 |
_mm_empty(); |
1450 |
return TRUE; |
1661 |
return TRUE; |
1451 |
} |
1662 |
} |
1452 |
|
1663 |
|
|
|
1664 |
void |
1665 |
fbCompositeCopyAreammx (CARD8 op, |
1666 |
PicturePtr pSrc, |
1667 |
PicturePtr pMask, |
1668 |
PicturePtr pDst, |
1669 |
INT16 xSrc, |
1670 |
INT16 ySrc, |
1671 |
INT16 xMask, |
1672 |
INT16 yMask, |
1673 |
INT16 xDst, |
1674 |
INT16 yDst, |
1675 |
CARD16 width, |
1676 |
CARD16 height) |
1677 |
{ |
1678 |
fbCopyAreammx (pSrc->pDrawable, |
1679 |
pDst->pDrawable, |
1680 |
xSrc, ySrc, |
1681 |
xDst, yDst, |
1682 |
width, height); |
1683 |
} |
1684 |
|
1685 |
#ifndef __amd64__ |
1453 |
Bool |
1686 |
Bool |
1454 |
fbHaveMMX (void) |
1687 |
fbHaveMMX (void) |
1455 |
{ |
1688 |
{ |
1456 |
static Bool initialized = FALSE; |
1689 |
static Bool initialized = FALSE; |
1457 |
static Bool mmx_present; |
1690 |
static Bool mmx_present; |
1458 |
|
1691 |
|
1459 |
if (!initialized) |
1692 |
if (!initialized) |
1460 |
{ |
1693 |
{ |
1461 |
int tmp; /* static variables are accessed through %ebx, |
1694 |
int tmp; /* static variables are accessed through %ebx, |
Lines 1466-1472
Link Here
|
1466 |
|
1699 |
|
1467 |
__asm__ __volatile__ ( |
1700 |
__asm__ __volatile__ ( |
1468 |
/* Check if bit 21 in flags word is writeable */ |
1701 |
/* Check if bit 21 in flags word is writeable */ |
1469 |
|
1702 |
|
1470 |
"pusha \n\t" |
1703 |
"pusha \n\t" |
1471 |
"pushfl \n\t" |
1704 |
"pushfl \n\t" |
1472 |
"popl %%eax \n\t" |
1705 |
"popl %%eax \n\t" |
Lines 1502-1514
Link Here
|
1502 |
: /* no input */); |
1735 |
: /* no input */); |
1503 |
|
1736 |
|
1504 |
initialized = TRUE; |
1737 |
initialized = TRUE; |
1505 |
|
1738 |
|
1506 |
mmx_present = tmp; |
1739 |
mmx_present = tmp; |
1507 |
} |
1740 |
} |
1508 |
|
1741 |
|
1509 |
return mmx_present; |
1742 |
return mmx_present; |
1510 |
} |
1743 |
} |
|
|
1744 |
#endif /* __amd64__ */ |
1511 |
|
1745 |
|
1512 |
|
1746 |
|
1513 |
#endif /* RENDER */ |
1747 |
#endif /* RENDER */ |
1514 |
#endif /* USE_GCC34_MMX */ |
1748 |
#endif /* USE_MMX */ |