Lines 1-5
Link Here
|
1 |
/* |
1 |
/* |
2 |
* Copyright © 2004 Red Hat, Inc. |
2 |
* Copyright © 2004 Red Hat, Inc. |
|
|
3 |
* Copyright © 2004 Nicholas Miell |
3 |
* |
4 |
* |
4 |
* Permission to use, copy, modify, distribute, and sell this software and its |
5 |
* Permission to use, copy, modify, distribute, and sell this software and its |
5 |
* documentation for any purpose is hereby granted without fee, provided that |
6 |
* documentation for any purpose is hereby granted without fee, provided that |
Lines 18-31
Link Here
|
18 |
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
19 |
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN |
19 |
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
20 |
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
20 |
* |
21 |
* |
21 |
* Author: Søren Sandmann (sandmann@redhat.com) |
22 |
* Author: Søren Sandmann (sandmann@redhat.com) |
22 |
* |
23 |
* Minor Improvements: Nicholas Miell (nmiell@gmail.com) |
|
|
24 |
* |
23 |
* Based on work by Owen Taylor |
25 |
* Based on work by Owen Taylor |
24 |
*/ |
26 |
*/ |
25 |
|
27 |
|
26 |
#include "fb.h" |
|
|
27 |
|
28 |
|
28 |
#ifdef USE_GCC34_MMX |
29 |
#ifdef USE_GCC34_MMX |
|
|
30 |
#else |
31 |
#error "Kala" |
32 |
#include "fb.h" |
33 |
#include "fbmmx.h" |
34 |
|
35 |
#include <mmintrin.h> |
36 |
|
37 |
#ifdef USE_SSE |
38 |
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ |
39 |
#endif |
29 |
|
40 |
|
30 |
#ifdef RENDER |
41 |
#ifdef RENDER |
31 |
|
42 |
|
Lines 33-43
Link Here
|
33 |
#include "mipict.h" |
44 |
#include "mipict.h" |
34 |
#include "fbpict.h" |
45 |
#include "fbpict.h" |
35 |
|
46 |
|
36 |
typedef int Vector1x64 __attribute__ ((mode(DI))); |
|
|
37 |
typedef int Vector2x32 __attribute__ ((mode(V2SI))); |
38 |
typedef int Vector4x16 __attribute__ ((mode(V4HI))); |
39 |
typedef int Vector8x8 __attribute__ ((mode(V8QI))); |
40 |
|
41 |
typedef unsigned long long ullong; |
47 |
typedef unsigned long long ullong; |
42 |
|
48 |
|
43 |
#define noVERBOSE |
49 |
#define noVERBOSE |
Lines 50-56
Link Here
|
50 |
|
56 |
|
51 |
typedef struct |
57 |
typedef struct |
52 |
{ |
58 |
{ |
53 |
ullong mmx_zero; |
|
|
54 |
ullong mmx_4x00ff; |
59 |
ullong mmx_4x00ff; |
55 |
ullong mmx_4x0080; |
60 |
ullong mmx_4x0080; |
56 |
ullong mmx_565_rgb; |
61 |
ullong mmx_565_rgb; |
Lines 70-76
Link Here
|
70 |
|
75 |
|
71 |
static const MMXData c = |
76 |
static const MMXData c = |
72 |
{ |
77 |
{ |
73 |
.mmx_zero = 0x0000000000000000ULL, |
|
|
74 |
.mmx_4x00ff = 0x00ff00ff00ff00ffULL, |
78 |
.mmx_4x00ff = 0x00ff00ff00ff00ffULL, |
75 |
.mmx_4x0080 = 0x0080008000800080ULL, |
79 |
.mmx_4x0080 = 0x0080008000800080ULL, |
76 |
.mmx_565_rgb = 0x000001f0003f001fULL, |
80 |
.mmx_565_rgb = 0x000001f0003f001fULL, |
Lines 88-208
Link Here
|
88 |
.mmx_000000000000ffff = 0x000000000000ffffULL, |
92 |
.mmx_000000000000ffff = 0x000000000000ffffULL, |
89 |
}; |
93 |
}; |
90 |
|
94 |
|
91 |
static __inline__ Vector1x64 |
95 |
#define MC(x) ((__m64) c.mmx_##x) |
92 |
shift (Vector1x64 v, int s) |
96 |
|
|
|
97 |
static __inline__ __m64 |
98 |
shift (__m64 v, int s) |
93 |
{ |
99 |
{ |
94 |
if (s > 0) |
100 |
if (s > 0) |
95 |
return __builtin_ia32_psllq (v, s); |
101 |
return _mm_slli_si64 (v, s); |
96 |
else if (s < 0) |
102 |
else if (s < 0) |
97 |
return __builtin_ia32_psrlq (v, -s); |
103 |
return _mm_srli_si64 (v, -s); |
98 |
else |
104 |
else |
99 |
return v; |
105 |
return v; |
100 |
} |
106 |
} |
101 |
|
107 |
|
102 |
static __inline__ Vector4x16 |
108 |
static __inline__ __m64 |
103 |
negate (Vector4x16 mask) |
109 |
negate (__m64 mask) |
104 |
{ |
110 |
{ |
105 |
return (Vector4x16)__builtin_ia32_pxor ( |
111 |
return _mm_xor_si64 (mask, MC(4x00ff)); |
106 |
(Vector1x64)mask, |
|
|
107 |
(Vector1x64)c.mmx_4x00ff); |
108 |
} |
112 |
} |
109 |
|
113 |
|
110 |
static __inline__ Vector4x16 |
114 |
static __inline__ __m64 |
111 |
pix_multiply (Vector4x16 a, Vector4x16 b) |
115 |
pix_multiply (__m64 a, __m64 b) |
112 |
{ |
116 |
{ |
113 |
Vector4x16 res; |
117 |
__m64 res; |
114 |
|
118 |
|
115 |
res = __builtin_ia32_pmullw (a, b); |
119 |
res = _mm_mullo_pi16 (a, b); |
116 |
res = __builtin_ia32_paddw (res, (Vector4x16)c.mmx_4x0080); |
120 |
res = _mm_add_pi16 (res, MC(4x0080)); |
117 |
res = __builtin_ia32_psrlw (res, 8); |
121 |
res = _mm_srli_pi16 (res, 8); |
118 |
|
122 |
|
119 |
return res; |
123 |
return res; |
120 |
} |
124 |
} |
121 |
|
125 |
|
122 |
#if 0 |
126 |
#ifdef USE_SSE |
123 |
#define HAVE_PSHUFW |
127 |
#define HAVE_PSHUFW |
124 |
#endif |
128 |
#endif |
125 |
|
129 |
|
126 |
#ifdef HAVE_PSHUFW |
130 |
#ifdef HAVE_PSHUFW |
127 |
|
131 |
|
128 |
static __inline__ Vector4x16 |
132 |
static __inline__ __m64 |
129 |
expand_alpha (Vector4x16 pixel) |
133 |
expand_alpha (__m64 pixel) |
130 |
{ |
134 |
{ |
131 |
Vector4x16 result; |
135 |
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3)); |
132 |
__asm__ ("pshufw $0xFF, %1, %0\n\t" : "=y" (result) : "y" (pixel)); |
|
|
133 |
return result; |
134 |
} |
136 |
} |
135 |
|
137 |
|
136 |
static __inline__ Vector4x16 |
138 |
static __inline__ __m64 |
137 |
expand_alpha_rev (Vector4x16 pixel) |
139 |
expand_alpha_rev (__m64 pixel) |
138 |
{ |
140 |
{ |
139 |
Vector4x16 result; |
141 |
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0)); |
140 |
__asm__ ("pshufw $0x00, %1, %0\n\t" : "=y" (result) : "y" (pixel)); |
|
|
141 |
return result; |
142 |
} |
142 |
} |
143 |
|
143 |
|
144 |
static __inline__ Vector4x16 |
144 |
static __inline__ __m64 |
145 |
invert_colors (Vector4x16 pixel) |
145 |
invert_colors (__m64 pixel) |
146 |
{ |
146 |
{ |
147 |
Vector4x16 result; |
147 |
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2)); |
148 |
|
|
|
149 |
/* 0xC6 = 11000110 */ |
150 |
/* 3 0 1 2 */ |
151 |
|
152 |
__asm__ ("pshufw $0xC6, %1, %0\n\t" : "=y" (result) : "y" (pixel)); |
153 |
|
154 |
return result; |
155 |
} |
148 |
} |
156 |
|
149 |
|
157 |
#else |
150 |
#else |
158 |
|
151 |
|
159 |
static __inline__ Vector4x16 |
152 |
static __inline__ __m64 |
160 |
expand_alpha (Vector4x16 pixel) |
153 |
expand_alpha (__m64 pixel) |
161 |
{ |
154 |
{ |
162 |
Vector1x64 t1, t2; |
155 |
__m64 t1, t2; |
163 |
|
156 |
|
164 |
t1 = shift ((Vector1x64)pixel, -48); |
157 |
t1 = shift (pixel, -48); |
165 |
t2 = shift (t1, 16); |
158 |
t2 = shift (t1, 16); |
166 |
t1 = __builtin_ia32_por (t1, t2); |
159 |
t1 = _mm_or_si64 (t1, t2); |
167 |
t2 = shift (t1, 32); |
160 |
t2 = shift (t1, 32); |
168 |
t1 = __builtin_ia32_por (t1, t2); |
161 |
t1 = _mm_or_si64 (t1, t2); |
169 |
|
162 |
|
170 |
return (Vector4x16)t1; |
163 |
return t1; |
171 |
} |
164 |
} |
172 |
|
165 |
|
173 |
static __inline__ Vector4x16 |
166 |
static __inline__ __m64 |
174 |
expand_alpha_rev (Vector4x16 pixel) |
167 |
expand_alpha_rev (__m64 pixel) |
175 |
{ |
168 |
{ |
176 |
Vector1x64 t1, t2; |
169 |
__m64 t1, t2; |
177 |
|
170 |
|
178 |
t1 = shift ((Vector1x64)pixel, 48); |
171 |
/* move alpha to low 16 bits and zero the rest */ |
|
|
172 |
t1 = shift (pixel, 48); |
179 |
t1 = shift (t1, -48); |
173 |
t1 = shift (t1, -48); |
|
|
174 |
|
180 |
t2 = shift (t1, 16); |
175 |
t2 = shift (t1, 16); |
181 |
t1 = __builtin_ia32_por (t1, t2); |
176 |
t1 = _mm_or_si64 (t1, t2); |
182 |
t2 = shift (t1, 32); |
177 |
t2 = shift (t1, 32); |
183 |
t1 = __builtin_ia32_por (t1, t2); |
178 |
t1 = _mm_or_si64 (t1, t2); |
184 |
|
179 |
|
185 |
return (Vector4x16)t1; |
180 |
return t1; |
186 |
} |
181 |
} |
187 |
|
182 |
|
188 |
static __inline__ Vector4x16 |
183 |
static __inline__ __m64 |
189 |
invert_colors (Vector4x16 pixel) |
184 |
invert_colors (__m64 pixel) |
190 |
{ |
185 |
{ |
191 |
Vector1x64 x, y, z; |
186 |
__m64 x, y, z; |
192 |
|
187 |
|
193 |
x = y = z = (Vector1x64)pixel; |
188 |
x = y = z = pixel; |
194 |
|
189 |
|
195 |
x = __builtin_ia32_pand (x, (Vector1x64)c.mmx_ffff0000ffff0000); |
190 |
x = _mm_and_si64 (x, MC(ffff0000ffff0000)); |
196 |
y = __builtin_ia32_pand (y, (Vector1x64)c.mmx_000000000000ffff); |
191 |
y = _mm_and_si64 (y, MC(000000000000ffff)); |
197 |
z = __builtin_ia32_pand (z, (Vector1x64)c.mmx_0000ffff00000000); |
192 |
z = _mm_and_si64 (z, MC(0000ffff00000000)); |
198 |
|
193 |
|
199 |
y = shift (y, 32); |
194 |
y = shift (y, 32); |
200 |
z = shift (z, -32); |
195 |
z = shift (z, -32); |
201 |
|
196 |
|
202 |
x = __builtin_ia32_por (x, y); |
197 |
x = _mm_or_si64 (x, y); |
203 |
x = __builtin_ia32_por (x, z); |
198 |
x = _mm_or_si64 (x, z); |
204 |
|
199 |
|
205 |
return (Vector4x16)x; |
200 |
return x; |
206 |
} |
201 |
} |
207 |
|
202 |
|
208 |
#endif |
203 |
#endif |
Lines 210-356
Link Here
|
210 |
/* Notes about writing mmx code |
205 |
/* Notes about writing mmx code |
211 |
* |
206 |
* |
212 |
* give memory operands as the second operand. If you give it as the |
207 |
* give memory operands as the second operand. If you give it as the |
213 |
* first, gcc will first load it into a register, then use that register |
208 |
* first, gcc will first load it into a register, then use that |
|
|
209 |
* register |
214 |
* |
210 |
* |
215 |
* ie. use |
211 |
* ie. use |
216 |
* |
212 |
* |
217 |
* __builtin_pmullw (x, mmx_constant[8]); |
213 |
* _mm_mullo_pi16 (x, mmx_constant); |
218 |
* |
214 |
* |
219 |
* not |
215 |
* not |
220 |
* |
216 |
* |
221 |
* __builtin_pmullw (mmx_constant[8], x); |
217 |
* _mm_mullo_pi16 (mmx_constant, x); |
222 |
* |
218 |
* |
223 |
* Also try to minimize dependencies. Ie. when you need a value, try to calculate |
219 |
* Also try to minimize dependencies. i.e. when you need a value, try |
224 |
* it from a value that was calculated as early as possible. |
220 |
* to calculate it from a value that was calculated as early as |
|
|
221 |
* possible. |
225 |
*/ |
222 |
*/ |
226 |
|
223 |
|
227 |
static __inline__ Vector4x16 |
224 |
static __inline__ __m64 |
228 |
over (Vector4x16 src, Vector4x16 srca, Vector4x16 dest) |
225 |
over (__m64 src, __m64 srca, __m64 dest) |
229 |
{ |
226 |
{ |
230 |
return (Vector4x16)__builtin_ia32_paddusb ((Vector8x8)src, (Vector8x8)pix_multiply(dest, negate(srca))); |
227 |
return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca))); |
231 |
} |
228 |
} |
232 |
|
229 |
|
233 |
static __inline__ Vector4x16 |
230 |
static __inline__ __m64 |
234 |
over_rev_non_pre (Vector4x16 src, Vector4x16 dest) |
231 |
over_rev_non_pre (__m64 src, __m64 dest) |
235 |
{ |
232 |
{ |
236 |
Vector4x16 srca = expand_alpha (src); |
233 |
__m64 srca = expand_alpha (src); |
237 |
Vector4x16 srcfaaa = (Vector4x16)__builtin_ia32_por((Vector1x64)srca, (Vector1x64)c.mmx_full_alpha); |
234 |
__m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha)); |
238 |
|
235 |
|
239 |
return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); |
236 |
return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); |
240 |
} |
237 |
} |
241 |
|
238 |
|
242 |
static __inline__ Vector4x16 |
239 |
static __inline__ __m64 |
243 |
in (Vector4x16 src, |
240 |
in (__m64 src, |
244 |
Vector4x16 mask) |
241 |
__m64 mask) |
245 |
{ |
242 |
{ |
246 |
return pix_multiply (src, mask); |
243 |
return pix_multiply (src, mask); |
247 |
} |
244 |
} |
248 |
|
245 |
|
249 |
static __inline__ Vector4x16 |
246 |
static __inline__ __m64 |
250 |
in_over (Vector4x16 src, |
247 |
in_over (__m64 src, |
251 |
Vector4x16 srca, |
248 |
__m64 srca, |
252 |
Vector4x16 mask, |
249 |
__m64 mask, |
253 |
Vector4x16 dest) |
250 |
__m64 dest) |
254 |
{ |
251 |
{ |
255 |
return over(in(src, mask), pix_multiply(srca, mask), dest); |
252 |
return over(in(src, mask), pix_multiply(srca, mask), dest); |
256 |
} |
253 |
} |
257 |
|
254 |
|
258 |
static __inline__ Vector8x8 |
255 |
static __inline__ __m64 |
259 |
cvt32to64 (CARD32 v) |
|
|
260 |
{ |
261 |
ullong r = v; |
262 |
return (Vector8x8)r; |
263 |
} |
264 |
|
265 |
static __inline__ Vector4x16 |
266 |
load8888 (CARD32 v) |
256 |
load8888 (CARD32 v) |
267 |
{ |
257 |
{ |
268 |
return (Vector4x16)__builtin_ia32_punpcklbw (cvt32to64 (v), |
258 |
return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64()); |
269 |
(Vector8x8)c.mmx_zero); |
|
|
270 |
} |
259 |
} |
271 |
|
260 |
|
272 |
static __inline__ Vector8x8 |
261 |
static __inline__ __m64 |
273 |
pack8888 (Vector4x16 lo, Vector4x16 hi) |
262 |
pack8888 (__m64 lo, __m64 hi) |
274 |
{ |
263 |
{ |
275 |
Vector8x8 r; |
264 |
__m64 r; |
276 |
r = __builtin_ia32_packuswb ((Vector4x16)lo, (Vector4x16)hi); |
265 |
r = _mm_packs_pu16 (lo, hi); |
277 |
return r; |
266 |
return r; |
278 |
} |
267 |
} |
279 |
|
268 |
|
280 |
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into 00RR00GG00BB |
269 |
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into |
281 |
|
270 |
* |
282 |
--- Expanding 565 in the low word --- |
271 |
* 00RR00GG00BB |
283 |
|
272 |
* |
284 |
m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
273 |
* --- Expanding 565 in the low word --- |
285 |
m = m & (01f0003f001f); |
274 |
* |
286 |
m = m * (008404100840); |
275 |
* m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
287 |
m = m >> 8; |
276 |
* m = m & (01f0003f001f); |
288 |
|
277 |
* m = m * (008404100840); |
289 |
Note the trick here - the top word is shifted by another nibble to avoid |
278 |
* m = m >> 8; |
290 |
it bumping into the middle word |
279 |
* |
291 |
*/ |
280 |
* Note the trick here - the top word is shifted by another nibble to |
292 |
static __inline__ Vector4x16 |
281 |
* avoid it bumping into the middle word |
293 |
expand565 (Vector4x16 pixel, int pos) |
282 |
*/ |
|
|
283 |
static __inline__ __m64 |
284 |
expand565 (__m64 pixel, int pos) |
294 |
{ |
285 |
{ |
295 |
Vector1x64 p = (Vector1x64)pixel; |
286 |
__m64 p = pixel; |
|
|
287 |
__m64 t1, t2; |
296 |
|
288 |
|
297 |
/* move pixel to low 16 bit and zero the rest */ |
289 |
/* move pixel to low 16 bit and zero the rest */ |
298 |
p = shift (shift (p, (3 - pos) * 16), -48); |
290 |
p = shift (shift (p, (3 - pos) * 16), -48); |
299 |
|
291 |
|
300 |
Vector1x64 t1 = shift (p, 36 - 11); |
292 |
t1 = shift (p, 36 - 11); |
301 |
Vector1x64 t2 = shift (p, 16 - 5); |
293 |
t2 = shift (p, 16 - 5); |
302 |
|
294 |
|
303 |
p = __builtin_ia32_por (t1, p); |
295 |
p = _mm_or_si64 (t1, p); |
304 |
p = __builtin_ia32_por (t2, p); |
296 |
p = _mm_or_si64 (t2, p); |
305 |
p = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_rgb); |
297 |
p = _mm_and_si64 (p, MC(565_rgb)); |
306 |
|
298 |
|
307 |
pixel = __builtin_ia32_pmullw ((Vector4x16)p, (Vector4x16)c.mmx_565_unpack_multiplier); |
299 |
pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier)); |
308 |
return __builtin_ia32_psrlw (pixel, 8); |
300 |
return _mm_srli_pi16 (pixel, 8); |
309 |
} |
301 |
} |
310 |
|
302 |
|
311 |
static __inline__ Vector4x16 |
303 |
static __inline__ __m64 |
312 |
expand8888 (Vector4x16 in, int pos) |
304 |
expand8888 (__m64 in, int pos) |
313 |
{ |
305 |
{ |
314 |
if (pos == 0) |
306 |
if (pos == 0) |
315 |
return (Vector4x16)__builtin_ia32_punpcklbw ((Vector8x8)in, (Vector8x8)c.mmx_zero); |
307 |
return _mm_unpacklo_pi8 (in, _mm_setzero_si64()); |
316 |
else |
308 |
else |
317 |
return (Vector4x16)__builtin_ia32_punpckhbw ((Vector8x8)in, (Vector8x8)c.mmx_zero); |
309 |
return _mm_unpackhi_pi8 (in, _mm_setzero_si64()); |
318 |
} |
310 |
} |
319 |
|
311 |
|
320 |
static __inline__ Vector4x16 |
312 |
static __inline__ __m64 |
321 |
pack565 (Vector4x16 pixel, Vector4x16 target, int pos) |
313 |
pack565 (__m64 pixel, __m64 target, int pos) |
322 |
{ |
314 |
{ |
323 |
Vector1x64 p = (Vector1x64)pixel; |
315 |
__m64 p = pixel; |
324 |
Vector1x64 t = (Vector1x64)target; |
316 |
__m64 t = target; |
325 |
Vector1x64 r, g, b; |
317 |
__m64 r, g, b; |
326 |
|
318 |
|
327 |
r = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_r); |
319 |
r = _mm_and_si64 (p, MC(565_r)); |
328 |
g = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_g); |
320 |
g = _mm_and_si64 (p, MC(565_g)); |
329 |
b = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_b); |
321 |
b = _mm_and_si64 (p, MC(565_b)); |
330 |
|
322 |
|
331 |
r = shift (r, - (32 - 8) + pos * 16); |
323 |
r = shift (r, - (32 - 8) + pos * 16); |
332 |
g = shift (g, - (16 - 3) + pos * 16); |
324 |
g = shift (g, - (16 - 3) + pos * 16); |
333 |
b = shift (b, - (0 + 3) + pos * 16); |
325 |
b = shift (b, - (0 + 3) + pos * 16); |
334 |
|
326 |
|
335 |
if (pos == 0) |
327 |
if (pos == 0) |
336 |
t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_0); |
328 |
t = _mm_and_si64 (t, MC(mask_0)); |
337 |
else if (pos == 1) |
329 |
else if (pos == 1) |
338 |
t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_1); |
330 |
t = _mm_and_si64 (t, MC(mask_1)); |
339 |
else if (pos == 2) |
331 |
else if (pos == 2) |
340 |
t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_2); |
332 |
t = _mm_and_si64 (t, MC(mask_2)); |
341 |
else if (pos == 3) |
333 |
else if (pos == 3) |
342 |
t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_3); |
334 |
t = _mm_and_si64 (t, MC(mask_3)); |
343 |
|
335 |
|
344 |
p = __builtin_ia32_por (r, t); |
336 |
p = _mm_or_si64 (r, t); |
345 |
p = __builtin_ia32_por (g, p); |
337 |
p = _mm_or_si64 (g, p); |
346 |
|
338 |
|
347 |
return (Vector4x16)__builtin_ia32_por (b, p); |
339 |
return _mm_or_si64 (b, p); |
348 |
} |
|
|
349 |
|
350 |
static __inline__ void |
351 |
emms (void) |
352 |
{ |
353 |
__asm__ __volatile__ ("emms"); |
354 |
} |
340 |
} |
355 |
|
341 |
|
356 |
void |
342 |
void |
Lines 371-378
Link Here
|
371 |
CARD32 *dstLine, *dst; |
357 |
CARD32 *dstLine, *dst; |
372 |
CARD16 w; |
358 |
CARD16 w; |
373 |
FbStride dstStride; |
359 |
FbStride dstStride; |
374 |
Vector4x16 vsrc, vsrca; |
360 |
__m64 vsrc, vsrca; |
375 |
|
361 |
|
376 |
CHECKPOINT(); |
362 |
CHECKPOINT(); |
377 |
|
363 |
|
378 |
fbComposeGetSolid(pSrc, src, pDst->format); |
364 |
fbComposeGetSolid(pSrc, src, pDst->format); |
Lines 384-434
Link Here
|
384 |
|
370 |
|
385 |
vsrc = load8888 (src); |
371 |
vsrc = load8888 (src); |
386 |
vsrca = expand_alpha (vsrc); |
372 |
vsrca = expand_alpha (vsrc); |
387 |
|
373 |
|
388 |
while (height--) |
374 |
while (height--) |
389 |
{ |
375 |
{ |
390 |
dst = dstLine; |
376 |
dst = dstLine; |
391 |
dstLine += dstStride; |
377 |
dstLine += dstStride; |
392 |
w = width; |
378 |
w = width; |
393 |
|
379 |
|
394 |
CHECKPOINT(); |
380 |
CHECKPOINT(); |
395 |
|
381 |
|
396 |
while (w && (unsigned long)dst & 7) |
382 |
while (w && (unsigned long)dst & 7) |
397 |
{ |
383 |
{ |
398 |
*dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero); |
384 |
*dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), |
|
|
385 |
_mm_setzero_si64()); |
399 |
|
386 |
|
400 |
w--; |
387 |
w--; |
401 |
dst++; |
388 |
dst++; |
402 |
} |
389 |
} |
403 |
|
390 |
|
404 |
while (w >= 2) |
391 |
while (w >= 2) |
405 |
{ |
392 |
{ |
406 |
Vector4x16 vdest; |
393 |
__m64 vdest; |
407 |
Vector4x16 dest0, dest1; |
394 |
__m64 dest0, dest1; |
408 |
|
395 |
|
409 |
vdest = *(Vector4x16 *)dst; |
396 |
vdest = *(__m64 *)dst; |
410 |
|
397 |
|
411 |
dest0 = over(vsrc, vsrca, expand8888(vdest, 0)); |
398 |
dest0 = over(vsrc, vsrca, expand8888(vdest, 0)); |
412 |
dest1 = over(vsrc, vsrca, expand8888(vdest, 1)); |
399 |
dest1 = over(vsrc, vsrca, expand8888(vdest, 1)); |
413 |
|
400 |
|
414 |
*(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1); |
401 |
*(__m64 *)dst = pack8888(dest0, dest1); |
415 |
|
402 |
|
416 |
dst += 2; |
403 |
dst += 2; |
417 |
w -= 2; |
404 |
w -= 2; |
418 |
} |
405 |
} |
419 |
|
406 |
|
420 |
CHECKPOINT(); |
407 |
CHECKPOINT(); |
421 |
|
408 |
|
422 |
while (w) |
409 |
while (w) |
423 |
{ |
410 |
{ |
424 |
*dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero); |
411 |
*dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), _mm_setzero_si64()); |
425 |
|
412 |
|
426 |
w--; |
413 |
w--; |
427 |
dst++; |
414 |
dst++; |
428 |
} |
415 |
} |
429 |
} |
416 |
} |
430 |
|
417 |
|
431 |
emms(); |
418 |
_mm_empty(); |
432 |
} |
419 |
} |
433 |
|
420 |
|
434 |
void |
421 |
void |
Lines 449-456
Link Here
|
449 |
CARD16 *dstLine, *dst; |
436 |
CARD16 *dstLine, *dst; |
450 |
CARD16 w; |
437 |
CARD16 w; |
451 |
FbStride dstStride; |
438 |
FbStride dstStride; |
452 |
Vector4x16 vsrc, vsrca; |
439 |
__m64 vsrc, vsrca; |
453 |
|
440 |
|
454 |
CHECKPOINT(); |
441 |
CHECKPOINT(); |
455 |
|
442 |
|
456 |
fbComposeGetSolid(pSrc, src, pDst->format); |
443 |
fbComposeGetSolid(pSrc, src, pDst->format); |
Lines 462-510
Link Here
|
462 |
|
449 |
|
463 |
vsrc = load8888 (src); |
450 |
vsrc = load8888 (src); |
464 |
vsrca = expand_alpha (vsrc); |
451 |
vsrca = expand_alpha (vsrc); |
465 |
|
452 |
|
466 |
while (height--) |
453 |
while (height--) |
467 |
{ |
454 |
{ |
468 |
dst = dstLine; |
455 |
dst = dstLine; |
469 |
dstLine += dstStride; |
456 |
dstLine += dstStride; |
470 |
w = width; |
457 |
w = width; |
471 |
|
458 |
|
472 |
CHECKPOINT(); |
459 |
CHECKPOINT(); |
473 |
|
460 |
|
474 |
while (w && (unsigned long)dst & 7) |
461 |
while (w && (unsigned long)dst & 7) |
475 |
{ |
462 |
{ |
476 |
ullong d = *dst; |
463 |
ullong d = *dst; |
477 |
Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
464 |
__m64 vdest = expand565 ((__m64)d, 0); |
478 |
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); |
465 |
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); |
479 |
*dst = (ullong)vdest; |
466 |
*dst = (ullong)vdest; |
480 |
|
467 |
|
481 |
w--; |
468 |
w--; |
482 |
dst++; |
469 |
dst++; |
483 |
} |
470 |
} |
484 |
|
471 |
|
485 |
while (w >= 4) |
472 |
while (w >= 4) |
486 |
{ |
473 |
{ |
487 |
Vector4x16 vdest; |
474 |
__m64 vdest; |
488 |
|
475 |
|
489 |
vdest = *(Vector4x16 *)dst; |
476 |
vdest = *(__m64 *)dst; |
490 |
|
477 |
|
491 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0); |
478 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0); |
492 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1); |
479 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1); |
493 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2); |
480 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2); |
494 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3); |
481 |
vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3); |
495 |
|
482 |
|
496 |
*(Vector8x8 *)dst = (Vector8x8)vdest; |
483 |
*(__m64 *)dst = vdest; |
497 |
|
484 |
|
498 |
dst += 4; |
485 |
dst += 4; |
499 |
w -= 4; |
486 |
w -= 4; |
500 |
} |
487 |
} |
501 |
|
488 |
|
502 |
CHECKPOINT(); |
489 |
CHECKPOINT(); |
503 |
|
490 |
|
504 |
while (w) |
491 |
while (w) |
505 |
{ |
492 |
{ |
506 |
ullong d = *dst; |
493 |
ullong d = *dst; |
507 |
Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
494 |
__m64 vdest = expand565 ((__m64)d, 0); |
508 |
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); |
495 |
vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0); |
509 |
*dst = (ullong)vdest; |
496 |
*dst = (ullong)vdest; |
510 |
|
497 |
|
Lines 513-519
Link Here
|
513 |
} |
500 |
} |
514 |
} |
501 |
} |
515 |
|
502 |
|
516 |
emms(); |
503 |
_mm_empty(); |
517 |
} |
504 |
} |
518 |
|
505 |
|
519 |
void |
506 |
void |
Lines 534-541
Link Here
|
534 |
CARD32 *dstLine; |
521 |
CARD32 *dstLine; |
535 |
CARD32 *maskLine; |
522 |
CARD32 *maskLine; |
536 |
FbStride dstStride, maskStride; |
523 |
FbStride dstStride, maskStride; |
537 |
Vector4x16 vsrc, vsrca; |
524 |
__m64 vsrc, vsrca; |
538 |
|
525 |
|
539 |
CHECKPOINT(); |
526 |
CHECKPOINT(); |
540 |
|
527 |
|
541 |
fbComposeGetSolid(pSrc, src, pDst->format); |
528 |
fbComposeGetSolid(pSrc, src, pDst->format); |
Lines 562-570
Link Here
|
562 |
|
549 |
|
563 |
if (m) |
550 |
if (m) |
564 |
{ |
551 |
{ |
565 |
Vector4x16 vdest = load8888(*q); |
552 |
__m64 vdest = load8888(*q); |
566 |
vdest = in_over(vsrc, vsrca, load8888(m), vdest); |
553 |
vdest = in_over(vsrc, vsrca, load8888(m), vdest); |
567 |
*q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
554 |
*q = (ullong)pack8888(vdest, _mm_setzero_si64()); |
568 |
} |
555 |
} |
569 |
|
556 |
|
570 |
twidth--; |
557 |
twidth--; |
Lines 580-594
Link Here
|
580 |
|
567 |
|
581 |
if (m0 | m1) |
568 |
if (m0 | m1) |
582 |
{ |
569 |
{ |
583 |
Vector4x16 dest0, dest1; |
570 |
__m64 dest0, dest1; |
584 |
Vector4x16 vdest = *(Vector4x16 *)q; |
571 |
__m64 vdest = *(__m64 *)q; |
585 |
|
572 |
|
586 |
dest0 = in_over(vsrc, vsrca, load8888(m0), |
573 |
dest0 = in_over(vsrc, vsrca, load8888(m0), |
587 |
expand8888 (vdest, 0)); |
574 |
expand8888 (vdest, 0)); |
588 |
dest1 = in_over(vsrc, vsrca, load8888(m1), |
575 |
dest1 = in_over(vsrc, vsrca, load8888(m1), |
589 |
expand8888 (vdest, 1)); |
576 |
expand8888 (vdest, 1)); |
590 |
|
577 |
|
591 |
*(Vector8x8 *)q = (Vector8x8)pack8888(dest0, dest1); |
578 |
*(__m64 *)q = pack8888(dest0, dest1); |
592 |
} |
579 |
} |
593 |
|
580 |
|
594 |
p += 2; |
581 |
p += 2; |
Lines 602-610
Link Here
|
602 |
|
589 |
|
603 |
if (m) |
590 |
if (m) |
604 |
{ |
591 |
{ |
605 |
Vector4x16 vdest = load8888(*q); |
592 |
__m64 vdest = load8888(*q); |
606 |
vdest = in_over(vsrc, vsrca, load8888(m), vdest); |
593 |
vdest = in_over(vsrc, vsrca, load8888(m), vdest); |
607 |
*q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
594 |
*q = (ullong)pack8888(vdest, _mm_setzero_si64()); |
608 |
} |
595 |
} |
609 |
|
596 |
|
610 |
twidth--; |
597 |
twidth--; |
Lines 616-622
Link Here
|
616 |
maskLine += maskStride; |
603 |
maskLine += maskStride; |
617 |
} |
604 |
} |
618 |
|
605 |
|
619 |
emms(); |
606 |
_mm_empty(); |
|
|
607 |
} |
608 |
|
609 |
void |
610 |
fbCompositeSrc_8888x8x8888mmx (CARD8 op, |
611 |
PicturePtr pSrc, |
612 |
PicturePtr pMask, |
613 |
PicturePtr pDst, |
614 |
INT16 xSrc, |
615 |
INT16 ySrc, |
616 |
INT16 xMask, |
617 |
INT16 yMask, |
618 |
INT16 xDst, |
619 |
INT16 yDst, |
620 |
CARD16 width, |
621 |
CARD16 height) |
622 |
{ |
623 |
CARD32 *dstLine, *dst; |
624 |
CARD32 *srcLine, *src; |
625 |
CARD8 *maskLine; |
626 |
CARD32 mask; |
627 |
__m64 vmask; |
628 |
FbStride dstStride, srcStride, maskStride; |
629 |
CARD16 w; |
630 |
__m64 srca; |
631 |
|
632 |
CHECKPOINT(); |
633 |
|
634 |
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
635 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
636 |
fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); |
637 |
|
638 |
mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine; |
639 |
vmask = load8888 (mask); |
640 |
srca = MC(4x00ff); |
641 |
|
642 |
while (height--) |
643 |
{ |
644 |
dst = dstLine; |
645 |
dstLine += dstStride; |
646 |
src = srcLine; |
647 |
srcLine += srcStride; |
648 |
w = width; |
649 |
|
650 |
while (w && (unsigned long)dst & 7) |
651 |
{ |
652 |
__m64 s = load8888 (*src); |
653 |
__m64 d = load8888 (*dst); |
654 |
|
655 |
*dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); |
656 |
|
657 |
w--; |
658 |
dst++; |
659 |
src++; |
660 |
} |
661 |
|
662 |
while (w >= 16) |
663 |
{ |
664 |
__m64 vd0 = *(__m64 *)(dst + 0); |
665 |
__m64 vd1 = *(__m64 *)(dst + 2); |
666 |
__m64 vd2 = *(__m64 *)(dst + 4); |
667 |
__m64 vd3 = *(__m64 *)(dst + 6); |
668 |
__m64 vd4 = *(__m64 *)(dst + 8); |
669 |
__m64 vd5 = *(__m64 *)(dst + 10); |
670 |
__m64 vd6 = *(__m64 *)(dst + 12); |
671 |
__m64 vd7 = *(__m64 *)(dst + 14); |
672 |
|
673 |
__m64 vs0 = *(__m64 *)(src + 0); |
674 |
__m64 vs1 = *(__m64 *)(src + 2); |
675 |
__m64 vs2 = *(__m64 *)(src + 4); |
676 |
__m64 vs3 = *(__m64 *)(src + 6); |
677 |
__m64 vs4 = *(__m64 *)(src + 8); |
678 |
__m64 vs5 = *(__m64 *)(src + 10); |
679 |
__m64 vs6 = *(__m64 *)(src + 12); |
680 |
__m64 vs7 = *(__m64 *)(dst + 14); |
681 |
|
682 |
vd0 = (__m64)pack8888 ( |
683 |
in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), |
684 |
in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); |
685 |
|
686 |
vd1 = (__m64)pack8888 ( |
687 |
in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), |
688 |
in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); |
689 |
|
690 |
vd2 = (__m64)pack8888 ( |
691 |
in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), |
692 |
in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); |
693 |
|
694 |
vd3 = (__m64)pack8888 ( |
695 |
in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), |
696 |
in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); |
697 |
|
698 |
vd4 = (__m64)pack8888 ( |
699 |
in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), |
700 |
in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); |
701 |
|
702 |
vd5 = (__m64)pack8888 ( |
703 |
in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), |
704 |
in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); |
705 |
|
706 |
vd6 = (__m64)pack8888 ( |
707 |
in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), |
708 |
in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); |
709 |
|
710 |
vd7 = (__m64)pack8888 ( |
711 |
in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), |
712 |
in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); |
713 |
|
714 |
w -= 16; |
715 |
dst += 16; |
716 |
src += 16; |
717 |
} |
718 |
|
719 |
while (w) |
720 |
{ |
721 |
__m64 s = load8888 (*src); |
722 |
__m64 d = load8888 (*dst); |
723 |
|
724 |
*dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64()); |
725 |
|
726 |
w--; |
727 |
dst++; |
728 |
src++; |
729 |
} |
730 |
} |
731 |
|
732 |
_mm_empty(); |
620 |
} |
733 |
} |
621 |
|
734 |
|
622 |
void |
735 |
void |
Lines 638-644
Link Here
|
638 |
CARD8 *maskLine, *mask; |
751 |
CARD8 *maskLine, *mask; |
639 |
FbStride dstStride, maskStride; |
752 |
FbStride dstStride, maskStride; |
640 |
CARD16 w; |
753 |
CARD16 w; |
641 |
Vector4x16 vsrc, vsrca; |
754 |
__m64 vsrc, vsrca; |
642 |
ullong srcsrc; |
755 |
ullong srcsrc; |
643 |
|
756 |
|
644 |
CHECKPOINT(); |
757 |
CHECKPOINT(); |
Lines 648-654
Link Here
|
648 |
srca = src >> 24; |
761 |
srca = src >> 24; |
649 |
if (srca == 0) |
762 |
if (srca == 0) |
650 |
return; |
763 |
return; |
651 |
|
764 |
|
652 |
srcsrc = (unsigned long long)src << 32 | src; |
765 |
srcsrc = (unsigned long long)src << 32 | src; |
653 |
|
766 |
|
654 |
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
767 |
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
Lines 664-670
Link Here
|
664 |
mask = maskLine; |
777 |
mask = maskLine; |
665 |
maskLine += maskStride; |
778 |
maskLine += maskStride; |
666 |
w = width; |
779 |
w = width; |
667 |
|
780 |
|
668 |
CHECKPOINT(); |
781 |
CHECKPOINT(); |
669 |
|
782 |
|
670 |
while (w && (unsigned long)dst & 7) |
783 |
while (w && (unsigned long)dst & 7) |
Lines 673-687
Link Here
|
673 |
|
786 |
|
674 |
if (m) |
787 |
if (m) |
675 |
{ |
788 |
{ |
676 |
Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), load8888(*dst)); |
789 |
__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst)); |
677 |
*dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
790 |
*dst = (ullong)pack8888(vdest, _mm_setzero_si64()); |
678 |
} |
791 |
} |
679 |
|
792 |
|
680 |
w--; |
793 |
w--; |
681 |
mask++; |
794 |
mask++; |
682 |
dst++; |
795 |
dst++; |
683 |
} |
796 |
} |
684 |
|
797 |
|
685 |
CHECKPOINT(); |
798 |
CHECKPOINT(); |
686 |
|
799 |
|
687 |
while (w >= 2) |
800 |
while (w >= 2) |
Lines 689-717
Link Here
|
689 |
ullong m0, m1; |
802 |
ullong m0, m1; |
690 |
m0 = *mask; |
803 |
m0 = *mask; |
691 |
m1 = *(mask + 1); |
804 |
m1 = *(mask + 1); |
692 |
|
805 |
|
693 |
if (srca == 0xff && (m0 & m1) == 0xff) |
806 |
if (srca == 0xff && (m0 & m1) == 0xff) |
694 |
{ |
807 |
{ |
695 |
*(unsigned long long *)dst = srcsrc; |
808 |
*(unsigned long long *)dst = srcsrc; |
696 |
} |
809 |
} |
697 |
else if (m0 | m1) |
810 |
else if (m0 | m1) |
698 |
{ |
811 |
{ |
699 |
Vector4x16 vdest; |
812 |
__m64 vdest; |
700 |
Vector4x16 dest0, dest1; |
813 |
__m64 dest0, dest1; |
701 |
|
814 |
|
702 |
vdest = *(Vector4x16 *)dst; |
815 |
vdest = *(__m64 *)dst; |
703 |
|
816 |
|
704 |
dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m0), expand8888(vdest, 0)); |
817 |
dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0)); |
705 |
dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m1), expand8888(vdest, 1)); |
818 |
dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1)); |
706 |
|
819 |
|
707 |
*(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1); |
820 |
*(__m64 *)dst = pack8888(dest0, dest1); |
708 |
} |
821 |
} |
709 |
|
822 |
|
710 |
mask += 2; |
823 |
mask += 2; |
711 |
dst += 2; |
824 |
dst += 2; |
712 |
w -= 2; |
825 |
w -= 2; |
713 |
} |
826 |
} |
714 |
|
827 |
|
715 |
CHECKPOINT(); |
828 |
CHECKPOINT(); |
716 |
|
829 |
|
717 |
while (w) |
830 |
while (w) |
Lines 720-728
Link Here
|
720 |
|
833 |
|
721 |
if (m) |
834 |
if (m) |
722 |
{ |
835 |
{ |
723 |
Vector4x16 vdest = load8888(*dst); |
836 |
__m64 vdest = load8888(*dst); |
724 |
vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), vdest); |
837 |
vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest); |
725 |
*dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero); |
838 |
*dst = (ullong)pack8888(vdest, _mm_setzero_si64()); |
726 |
} |
839 |
} |
727 |
|
840 |
|
728 |
w--; |
841 |
w--; |
Lines 731-737
Link Here
|
731 |
} |
844 |
} |
732 |
} |
845 |
} |
733 |
|
846 |
|
734 |
emms(); |
847 |
_mm_empty(); |
735 |
} |
848 |
} |
736 |
|
849 |
|
737 |
|
850 |
|
Lines 754-760
Link Here
|
754 |
CARD8 *maskLine, *mask; |
867 |
CARD8 *maskLine, *mask; |
755 |
FbStride dstStride, maskStride; |
868 |
FbStride dstStride, maskStride; |
756 |
CARD16 w; |
869 |
CARD16 w; |
757 |
Vector4x16 vsrc, vsrca; |
870 |
__m64 vsrc, vsrca; |
758 |
unsigned long long srcsrcsrcsrc, src16; |
871 |
unsigned long long srcsrcsrcsrc, src16; |
759 |
|
872 |
|
760 |
CHECKPOINT(); |
873 |
CHECKPOINT(); |
Lines 770-778
Link Here
|
770 |
|
883 |
|
771 |
vsrc = load8888 (src); |
884 |
vsrc = load8888 (src); |
772 |
vsrca = expand_alpha (vsrc); |
885 |
vsrca = expand_alpha (vsrc); |
773 |
|
886 |
|
774 |
src16 = (ullong)pack565(vsrc, (Vector4x16)c.mmx_zero, 0); |
887 |
src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0); |
775 |
|
888 |
|
776 |
srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 | |
889 |
srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 | |
777 |
(ullong)src16 << 16 | (ullong)src16; |
890 |
(ullong)src16 << 16 | (ullong)src16; |
778 |
|
891 |
|
Lines 783-789
Link Here
|
783 |
mask = maskLine; |
896 |
mask = maskLine; |
784 |
maskLine += maskStride; |
897 |
maskLine += maskStride; |
785 |
w = width; |
898 |
w = width; |
786 |
|
899 |
|
787 |
CHECKPOINT(); |
900 |
CHECKPOINT(); |
788 |
|
901 |
|
789 |
while (w && (unsigned long)dst & 7) |
902 |
while (w && (unsigned long)dst & 7) |
Lines 793-808
Link Here
|
793 |
if (m) |
906 |
if (m) |
794 |
{ |
907 |
{ |
795 |
ullong d = *dst; |
908 |
ullong d = *dst; |
796 |
Vector4x16 vd = (Vector4x16)d; |
909 |
__m64 vd = (__m64)d; |
797 |
Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0)); |
910 |
__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); |
798 |
*dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0); |
911 |
*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); |
799 |
} |
912 |
} |
800 |
|
913 |
|
801 |
w--; |
914 |
w--; |
802 |
mask++; |
915 |
mask++; |
803 |
dst++; |
916 |
dst++; |
804 |
} |
917 |
} |
805 |
|
918 |
|
806 |
CHECKPOINT(); |
919 |
CHECKPOINT(); |
807 |
|
920 |
|
808 |
while (w >= 4) |
921 |
while (w >= 4) |
Lines 812-846
Link Here
|
812 |
m1 = *(mask + 1); |
925 |
m1 = *(mask + 1); |
813 |
m2 = *(mask + 2); |
926 |
m2 = *(mask + 2); |
814 |
m3 = *(mask + 3); |
927 |
m3 = *(mask + 3); |
815 |
|
928 |
|
816 |
if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) |
929 |
if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) |
817 |
{ |
930 |
{ |
818 |
*(unsigned long long *)dst = srcsrcsrcsrc; |
931 |
*(unsigned long long *)dst = srcsrcsrcsrc; |
819 |
} |
932 |
} |
820 |
else if (m0 | m1 | m2 | m3) |
933 |
else if (m0 | m1 | m2 | m3) |
821 |
{ |
934 |
{ |
822 |
Vector4x16 vdest; |
935 |
__m64 vdest; |
823 |
Vector4x16 vm0, vm1, vm2, vm3; |
936 |
__m64 vm0, vm1, vm2, vm3; |
824 |
|
937 |
|
825 |
vdest = *(Vector4x16 *)dst; |
938 |
vdest = *(__m64 *)dst; |
826 |
|
939 |
|
827 |
vm0 = (Vector4x16)m0; |
940 |
vm0 = (__m64)m0; |
828 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0); |
941 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0); |
829 |
vm1 = (Vector4x16)m1; |
942 |
vm1 = (__m64)m1; |
830 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1); |
943 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1); |
831 |
vm2 = (Vector4x16)m2; |
944 |
vm2 = (__m64)m2; |
832 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2); |
945 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2); |
833 |
vm3 = (Vector4x16)m3; |
946 |
vm3 = (__m64)m3; |
834 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3); |
947 |
vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3); |
835 |
|
948 |
|
836 |
*(Vector4x16 *)dst = vdest; |
949 |
*(__m64 *)dst = vdest; |
837 |
} |
950 |
} |
838 |
|
951 |
|
839 |
w -= 4; |
952 |
w -= 4; |
840 |
mask += 4; |
953 |
mask += 4; |
841 |
dst += 4; |
954 |
dst += 4; |
842 |
} |
955 |
} |
843 |
|
956 |
|
844 |
CHECKPOINT(); |
957 |
CHECKPOINT(); |
845 |
|
958 |
|
846 |
while (w) |
959 |
while (w) |
Lines 850-858
Link Here
|
850 |
if (m) |
963 |
if (m) |
851 |
{ |
964 |
{ |
852 |
ullong d = *dst; |
965 |
ullong d = *dst; |
853 |
Vector4x16 vd = (Vector4x16)d; |
966 |
__m64 vd = (__m64)d; |
854 |
Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0)); |
967 |
__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0)); |
855 |
*dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0); |
968 |
*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0); |
856 |
} |
969 |
} |
857 |
|
970 |
|
858 |
w--; |
971 |
w--; |
Lines 861-867
Link Here
|
861 |
} |
974 |
} |
862 |
} |
975 |
} |
863 |
|
976 |
|
864 |
emms(); |
977 |
_mm_empty(); |
865 |
} |
978 |
} |
866 |
|
979 |
|
867 |
void |
980 |
void |
Lines 887-895
Link Here
|
887 |
|
1000 |
|
888 |
fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); |
1001 |
fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1); |
889 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
1002 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
890 |
|
1003 |
|
891 |
assert (pSrc->pDrawable == pMask->pDrawable); |
1004 |
assert (pSrc->pDrawable == pMask->pDrawable); |
892 |
|
1005 |
|
893 |
while (height--) |
1006 |
while (height--) |
894 |
{ |
1007 |
{ |
895 |
dst = dstLine; |
1008 |
dst = dstLine; |
Lines 897-910
Link Here
|
897 |
src = srcLine; |
1010 |
src = srcLine; |
898 |
srcLine += srcStride; |
1011 |
srcLine += srcStride; |
899 |
w = width; |
1012 |
w = width; |
900 |
|
1013 |
|
901 |
CHECKPOINT(); |
1014 |
CHECKPOINT(); |
902 |
|
1015 |
|
903 |
while (w && (unsigned long)dst & 7) |
1016 |
while (w && (unsigned long)dst & 7) |
904 |
{ |
1017 |
{ |
905 |
Vector4x16 vsrc = load8888 (*src); |
1018 |
__m64 vsrc = load8888 (*src); |
906 |
ullong d = *dst; |
1019 |
ullong d = *dst; |
907 |
Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
1020 |
__m64 vdest = expand565 ((__m64)d, 0); |
908 |
|
1021 |
|
909 |
vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); |
1022 |
vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); |
910 |
|
1023 |
|
Lines 914-932
Link Here
|
914 |
dst++; |
1027 |
dst++; |
915 |
src++; |
1028 |
src++; |
916 |
} |
1029 |
} |
917 |
|
1030 |
|
918 |
CHECKPOINT(); |
1031 |
CHECKPOINT(); |
919 |
|
1032 |
|
920 |
while (w >= 4) |
1033 |
while (w >= 4) |
921 |
{ |
1034 |
{ |
922 |
CARD32 s0, s1, s2, s3; |
1035 |
CARD32 s0, s1, s2, s3; |
923 |
unsigned char a0, a1, a2, a3; |
1036 |
unsigned char a0, a1, a2, a3; |
924 |
|
1037 |
|
925 |
s0 = *src; |
1038 |
s0 = *src; |
926 |
s1 = *(src + 1); |
1039 |
s1 = *(src + 1); |
927 |
s2 = *(src + 2); |
1040 |
s2 = *(src + 2); |
928 |
s3 = *(src + 3); |
1041 |
s3 = *(src + 3); |
929 |
|
1042 |
|
930 |
a0 = (s0 >> 24); |
1043 |
a0 = (s0 >> 24); |
931 |
a1 = (s1 >> 24); |
1044 |
a1 = (s1 >> 24); |
932 |
a2 = (s2 >> 24); |
1045 |
a2 = (s2 >> 24); |
Lines 934-971
Link Here
|
934 |
|
1047 |
|
935 |
if ((a0 & a1 & a2 & a3) == 0xFF) |
1048 |
if ((a0 & a1 & a2 & a3) == 0xFF) |
936 |
{ |
1049 |
{ |
937 |
Vector4x16 vdest; |
1050 |
__m64 vdest; |
938 |
vdest = pack565(invert_colors(load8888(s0)), (Vector4x16)c.mmx_zero, 0); |
1051 |
vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0); |
939 |
vdest = pack565(invert_colors(load8888(s1)), vdest, 1); |
1052 |
vdest = pack565(invert_colors(load8888(s1)), vdest, 1); |
940 |
vdest = pack565(invert_colors(load8888(s2)), vdest, 2); |
1053 |
vdest = pack565(invert_colors(load8888(s2)), vdest, 2); |
941 |
vdest = pack565(invert_colors(load8888(s3)), vdest, 3); |
1054 |
vdest = pack565(invert_colors(load8888(s3)), vdest, 3); |
942 |
|
1055 |
|
943 |
*(Vector4x16 *)dst = vdest; |
1056 |
*(__m64 *)dst = vdest; |
944 |
} |
1057 |
} |
945 |
else if (a0 | a1 | a2 | a3) |
1058 |
else if (a0 | a1 | a2 | a3) |
946 |
{ |
1059 |
{ |
947 |
Vector4x16 vdest = *(Vector4x16 *)dst; |
1060 |
__m64 vdest = *(__m64 *)dst; |
948 |
|
1061 |
|
949 |
vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0); |
1062 |
vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0); |
950 |
vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1); |
1063 |
vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1); |
951 |
vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2); |
1064 |
vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2); |
952 |
vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3); |
1065 |
vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3); |
953 |
|
1066 |
|
954 |
*(Vector4x16 *)dst = vdest; |
1067 |
*(__m64 *)dst = vdest; |
955 |
} |
1068 |
} |
956 |
|
1069 |
|
957 |
w -= 4; |
1070 |
w -= 4; |
958 |
dst += 4; |
1071 |
dst += 4; |
959 |
src += 4; |
1072 |
src += 4; |
960 |
} |
1073 |
} |
961 |
|
1074 |
|
962 |
CHECKPOINT(); |
1075 |
CHECKPOINT(); |
963 |
|
1076 |
|
964 |
while (w) |
1077 |
while (w) |
965 |
{ |
1078 |
{ |
966 |
Vector4x16 vsrc = load8888 (*src); |
1079 |
__m64 vsrc = load8888 (*src); |
967 |
ullong d = *dst; |
1080 |
ullong d = *dst; |
968 |
Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
1081 |
__m64 vdest = expand565 ((__m64)d, 0); |
969 |
|
1082 |
|
970 |
vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); |
1083 |
vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0); |
971 |
|
1084 |
|
Lines 976-986
Link Here
|
976 |
src++; |
1089 |
src++; |
977 |
} |
1090 |
} |
978 |
} |
1091 |
} |
979 |
|
1092 |
|
980 |
emms(); |
1093 |
_mm_empty(); |
981 |
} |
1094 |
} |
982 |
|
1095 |
|
983 |
/* "888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ |
1096 |
/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */ |
984 |
|
1097 |
|
985 |
void |
1098 |
void |
986 |
fbCompositeSrc_8888RevNPx8888mmx (CARD8 op, |
1099 |
fbCompositeSrc_8888RevNPx8888mmx (CARD8 op, |
Lines 1005-1013
Link Here
|
1005 |
|
1118 |
|
1006 |
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
1119 |
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); |
1007 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
1120 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); |
1008 |
|
1121 |
|
1009 |
assert (pSrc->pDrawable == pMask->pDrawable); |
1122 |
assert (pSrc->pDrawable == pMask->pDrawable); |
1010 |
|
1123 |
|
1011 |
while (height--) |
1124 |
while (height--) |
1012 |
{ |
1125 |
{ |
1013 |
dst = dstLine; |
1126 |
dst = dstLine; |
Lines 1015-1042
Link Here
|
1015 |
src = srcLine; |
1128 |
src = srcLine; |
1016 |
srcLine += srcStride; |
1129 |
srcLine += srcStride; |
1017 |
w = width; |
1130 |
w = width; |
1018 |
|
1131 |
|
1019 |
while (w && (unsigned long)dst & 7) |
1132 |
while (w && (unsigned long)dst & 7) |
1020 |
{ |
1133 |
{ |
1021 |
Vector4x16 s = load8888 (*src); |
1134 |
__m64 s = load8888 (*src); |
1022 |
Vector4x16 d = load8888 (*dst); |
1135 |
__m64 d = load8888 (*dst); |
1023 |
|
1136 |
|
1024 |
*dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero); |
1137 |
*dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); |
1025 |
|
1138 |
|
1026 |
w--; |
1139 |
w--; |
1027 |
dst++; |
1140 |
dst++; |
1028 |
src++; |
1141 |
src++; |
1029 |
} |
1142 |
} |
1030 |
|
1143 |
|
1031 |
while (w >= 2) |
1144 |
while (w >= 2) |
1032 |
{ |
1145 |
{ |
1033 |
ullong s0, s1; |
1146 |
ullong s0, s1; |
1034 |
unsigned char a0, a1; |
1147 |
unsigned char a0, a1; |
1035 |
Vector4x16 d0, d1; |
1148 |
__m64 d0, d1; |
1036 |
|
1149 |
|
1037 |
s0 = *src; |
1150 |
s0 = *src; |
1038 |
s1 = *(src + 1); |
1151 |
s1 = *(src + 1); |
1039 |
|
1152 |
|
1040 |
a0 = (s0 >> 24); |
1153 |
a0 = (s0 >> 24); |
1041 |
a1 = (s1 >> 24); |
1154 |
a1 = (s1 >> 24); |
1042 |
|
1155 |
|
Lines 1044-1060
Link Here
|
1044 |
{ |
1157 |
{ |
1045 |
d0 = invert_colors(load8888(s0)); |
1158 |
d0 = invert_colors(load8888(s0)); |
1046 |
d1 = invert_colors(load8888(s1)); |
1159 |
d1 = invert_colors(load8888(s1)); |
1047 |
|
1160 |
|
1048 |
*(Vector8x8 *)dst = pack8888 (d0, d1); |
1161 |
*(__m64 *)dst = pack8888 (d0, d1); |
1049 |
} |
1162 |
} |
1050 |
else if (a0 | a1) |
1163 |
else if (a0 | a1) |
1051 |
{ |
1164 |
{ |
1052 |
Vector4x16 vdest = *(Vector4x16 *)dst; |
1165 |
__m64 vdest = *(__m64 *)dst; |
1053 |
|
1166 |
|
1054 |
d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0)); |
1167 |
d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0)); |
1055 |
d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1)); |
1168 |
d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1)); |
1056 |
|
1169 |
|
1057 |
*(Vector8x8 *)dst = pack8888 (d0, d1); |
1170 |
*(__m64 *)dst = pack8888 (d0, d1); |
1058 |
} |
1171 |
} |
1059 |
|
1172 |
|
1060 |
w -= 2; |
1173 |
w -= 2; |
Lines 1064-1081
Link Here
|
1064 |
|
1177 |
|
1065 |
while (w) |
1178 |
while (w) |
1066 |
{ |
1179 |
{ |
1067 |
Vector4x16 s = load8888 (*src); |
1180 |
__m64 s = load8888 (*src); |
1068 |
Vector4x16 d = load8888 (*dst); |
1181 |
__m64 d = load8888 (*dst); |
1069 |
|
1182 |
|
1070 |
*dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero); |
1183 |
*dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64()); |
1071 |
|
1184 |
|
1072 |
w--; |
1185 |
w--; |
1073 |
dst++; |
1186 |
dst++; |
1074 |
src++; |
1187 |
src++; |
1075 |
} |
1188 |
} |
1076 |
} |
1189 |
} |
1077 |
|
1190 |
|
1078 |
emms(); |
1191 |
_mm_empty(); |
1079 |
} |
1192 |
} |
1080 |
|
1193 |
|
1081 |
void |
1194 |
void |
Lines 1096-1102
Link Here
|
1096 |
CARD16 *dstLine; |
1209 |
CARD16 *dstLine; |
1097 |
CARD32 *maskLine; |
1210 |
CARD32 *maskLine; |
1098 |
FbStride dstStride, maskStride; |
1211 |
FbStride dstStride, maskStride; |
1099 |
Vector4x16 vsrc, vsrca; |
1212 |
__m64 vsrc, vsrca; |
1100 |
|
1213 |
|
1101 |
CHECKPOINT(); |
1214 |
CHECKPOINT(); |
1102 |
|
1215 |
|
Lines 1125-1131
Link Here
|
1125 |
if (m) |
1238 |
if (m) |
1126 |
{ |
1239 |
{ |
1127 |
ullong d = *q; |
1240 |
ullong d = *q; |
1128 |
Vector4x16 vdest = expand565 ((Vector4x16)d, 0); |
1241 |
__m64 vdest = expand565 ((__m64)d, 0); |
1129 |
vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); |
1242 |
vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); |
1130 |
*q = (ullong)vdest; |
1243 |
*q = (ullong)vdest; |
1131 |
} |
1244 |
} |
Lines 1146-1159
Link Here
|
1146 |
|
1259 |
|
1147 |
if ((m0 | m1 | m2 | m3)) |
1260 |
if ((m0 | m1 | m2 | m3)) |
1148 |
{ |
1261 |
{ |
1149 |
Vector4x16 vdest = *(Vector4x16 *)q; |
1262 |
__m64 vdest = *(__m64 *)q; |
1150 |
|
1263 |
|
1151 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0); |
1264 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0); |
1152 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1); |
1265 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1); |
1153 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2); |
1266 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2); |
1154 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3); |
1267 |
vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3); |
1155 |
|
1268 |
|
1156 |
*(Vector4x16 *)q = vdest; |
1269 |
*(__m64 *)q = vdest; |
1157 |
} |
1270 |
} |
1158 |
twidth -= 4; |
1271 |
twidth -= 4; |
1159 |
p += 4; |
1272 |
p += 4; |
Lines 1168-1174
Link Here
|
1168 |
if (m) |
1281 |
if (m) |
1169 |
{ |
1282 |
{ |
1170 |
ullong d = *q; |
1283 |
ullong d = *q; |
1171 |
Vector4x16 vdest = expand565((Vector4x16)d, 0); |
1284 |
__m64 vdest = expand565((__m64)d, 0); |
1172 |
vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0); |
1285 |
vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0); |
1173 |
*q = (ullong)vdest; |
1286 |
*q = (ullong)vdest; |
1174 |
} |
1287 |
} |
Lines 1182-1188
Link Here
|
1182 |
dstLine += dstStride; |
1295 |
dstLine += dstStride; |
1183 |
} |
1296 |
} |
1184 |
|
1297 |
|
1185 |
emms (); |
1298 |
_mm_empty (); |
1186 |
} |
1299 |
} |
1187 |
|
1300 |
|
1188 |
void |
1301 |
void |
Lines 1210-1216
Link Here
|
1210 |
|
1323 |
|
1211 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1); |
1324 |
fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1); |
1212 |
fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1); |
1325 |
fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1); |
1213 |
|
1326 |
|
1214 |
while (height--) |
1327 |
while (height--) |
1215 |
{ |
1328 |
{ |
1216 |
dst = dstLine; |
1329 |
dst = dstLine; |
Lines 1218-1224
Link Here
|
1218 |
src = srcLine; |
1331 |
src = srcLine; |
1219 |
srcLine += srcStride; |
1332 |
srcLine += srcStride; |
1220 |
w = width; |
1333 |
w = width; |
1221 |
|
1334 |
|
1222 |
while (w && (unsigned long)dst & 7) |
1335 |
while (w && (unsigned long)dst & 7) |
1223 |
{ |
1336 |
{ |
1224 |
s = *src; |
1337 |
s = *src; |
Lines 1234-1246
Link Here
|
1234 |
|
1347 |
|
1235 |
while (w >= 8) |
1348 |
while (w >= 8) |
1236 |
{ |
1349 |
{ |
1237 |
__asm__ __volatile__ ( |
1350 |
*(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); |
1238 |
"movq (%0), %%mm2\n\t" |
|
|
1239 |
"movq (%1), %%mm3\n\t" |
1240 |
"paddusb %%mm2, %%mm3\n\t" |
1241 |
"movq %%mm3, (%1)\n\t" |
1242 |
: /* no output */ : "r" (src), "r" (dst)); |
1243 |
|
1244 |
dst += 8; |
1351 |
dst += 8; |
1245 |
src += 8; |
1352 |
src += 8; |
1246 |
w -= 8; |
1353 |
w -= 8; |
Lines 1259-1266
Link Here
|
1259 |
w--; |
1366 |
w--; |
1260 |
} |
1367 |
} |
1261 |
} |
1368 |
} |
1262 |
|
1369 |
|
1263 |
emms(); |
1370 |
_mm_empty(); |
1264 |
} |
1371 |
} |
1265 |
|
1372 |
|
1266 |
void |
1373 |
void |
Lines 1297-1309
Link Here
|
1297 |
|
1404 |
|
1298 |
while (w && (unsigned long)dst & 7) |
1405 |
while (w && (unsigned long)dst & 7) |
1299 |
{ |
1406 |
{ |
1300 |
__asm__ __volatile__ ( |
1407 |
*dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), |
1301 |
"movd %0, %%mm2\n\t" |
1408 |
_mm_cvtsi32_si64(*dst))); |
1302 |
"movd %1, %%mm3\n\t" |
|
|
1303 |
"paddusb %%mm2, %%mm3\n\t" |
1304 |
"movd %%mm3, %1\n\t" |
1305 |
: /* no output */ : "m" (*src), "m" (*dst)); |
1306 |
|
1307 |
dst++; |
1409 |
dst++; |
1308 |
src++; |
1410 |
src++; |
1309 |
w--; |
1411 |
w--; |
Lines 1311-1323
Link Here
|
1311 |
|
1413 |
|
1312 |
while (w >= 2) |
1414 |
while (w >= 2) |
1313 |
{ |
1415 |
{ |
1314 |
__asm__ __volatile__ ( |
1416 |
*(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); |
1315 |
"movq (%0), %%mm2\n\t" |
|
|
1316 |
"movq (%1), %%mm3\n\t" |
1317 |
"paddusb %%mm2, %%mm3\n\t" |
1318 |
"movq %%mm3, (%1)\n\t" |
1319 |
: /* no output */ : "r" (src), "r" (dst)); |
1320 |
|
1321 |
dst += 2; |
1417 |
dst += 2; |
1322 |
src += 2; |
1418 |
src += 2; |
1323 |
w -= 2; |
1419 |
w -= 2; |
Lines 1325-1340
Link Here
|
1325 |
|
1421 |
|
1326 |
if (w) |
1422 |
if (w) |
1327 |
{ |
1423 |
{ |
1328 |
__asm__ __volatile__ ( |
1424 |
*dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), |
1329 |
"movd %0, %%mm2\n\t" |
1425 |
_mm_cvtsi32_si64(*dst))); |
1330 |
"movd %1, %%mm3\n\t" |
1426 |
|
1331 |
"paddusb %%mm2, %%mm3\n\t" |
|
|
1332 |
"movd %%mm3, %1\n\t" |
1333 |
: /* no output */ : "m" (*src), "m" (*dst)); |
1334 |
} |
1427 |
} |
1335 |
} |
1428 |
} |
1336 |
|
1429 |
|
1337 |
emms(); |
1430 |
_mm_empty(); |
1338 |
} |
1431 |
} |
1339 |
|
1432 |
|
1340 |
#define GetStart(drw,x,y,type,stride,line,bpp) {\ |
1433 |
#define GetStart(drw,x,y,type,stride,line,bpp) {\ |
Lines 1358-1376
Link Here
|
1358 |
FbStride stride; |
1451 |
FbStride stride; |
1359 |
int bpp; |
1452 |
int bpp; |
1360 |
ullong fill; |
1453 |
ullong fill; |
1361 |
Vector8x8 vfill; |
1454 |
__m64 vfill; |
1362 |
CARD32 byte_width; |
1455 |
CARD32 byte_width; |
1363 |
CARD8 *byte_line; |
1456 |
CARD8 *byte_line; |
1364 |
FbBits *bits; |
1457 |
FbBits *bits; |
1365 |
int xoff, yoff; |
1458 |
int xoff, yoff; |
1366 |
|
1459 |
|
1367 |
CHECKPOINT(); |
1460 |
CHECKPOINT(); |
1368 |
|
1461 |
|
1369 |
fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); |
1462 |
fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); |
1370 |
|
1463 |
|
1371 |
if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) |
1464 |
if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) |
1372 |
return FALSE; |
1465 |
return FALSE; |
1373 |
|
1466 |
|
1374 |
if (bpp != 16 && bpp != 32) |
1467 |
if (bpp != 16 && bpp != 32) |
1375 |
return FALSE; |
1468 |
return FALSE; |
1376 |
|
1469 |
|
Lines 1388-1396
Link Here
|
1388 |
byte_width = 4 * width; |
1481 |
byte_width = 4 * width; |
1389 |
stride *= 4; |
1482 |
stride *= 4; |
1390 |
} |
1483 |
} |
1391 |
|
1484 |
|
1392 |
fill = ((ullong)xor << 32) | xor; |
1485 |
fill = ((ullong)xor << 32) | xor; |
1393 |
vfill = (Vector8x8)fill; |
1486 |
vfill = (__m64)fill; |
1394 |
|
1487 |
|
1395 |
while (height--) |
1488 |
while (height--) |
1396 |
{ |
1489 |
{ |
Lines 1398-1404
Link Here
|
1398 |
CARD8 *d = byte_line; |
1491 |
CARD8 *d = byte_line; |
1399 |
byte_line += stride; |
1492 |
byte_line += stride; |
1400 |
w = byte_width; |
1493 |
w = byte_width; |
1401 |
|
1494 |
|
1402 |
while (w >= 2 && ((unsigned long)d & 3)) |
1495 |
while (w >= 2 && ((unsigned long)d & 3)) |
1403 |
{ |
1496 |
{ |
1404 |
*(CARD16 *)d = xor; |
1497 |
*(CARD16 *)d = xor; |
Lines 1406-1440
Link Here
|
1406 |
d += 2; |
1499 |
d += 2; |
1407 |
} |
1500 |
} |
1408 |
|
1501 |
|
1409 |
while (w >= 4 && ((unsigned int)d & 7)) |
1502 |
while (w >= 4 && ((unsigned long)d & 7)) |
1410 |
{ |
1503 |
{ |
1411 |
*(CARD32 *)d = xor; |
1504 |
*(CARD32 *)d = xor; |
1412 |
|
1505 |
|
1413 |
w -= 4; |
1506 |
w -= 4; |
1414 |
d += 4; |
1507 |
d += 4; |
1415 |
} |
1508 |
} |
1416 |
|
1509 |
|
1417 |
while (w >= 64) |
1510 |
while (w >= 64) |
1418 |
{ |
1511 |
{ |
1419 |
__asm__ __volatile ( |
1512 |
*(__m64*) (d + 0) = vfill; |
1420 |
"movq %0, (%1)\n\t" |
1513 |
*(__m64*) (d + 8) = vfill; |
1421 |
"movq %0, 8(%1)\n\t" |
1514 |
*(__m64*) (d + 16) = vfill; |
1422 |
"movq %0, 16(%1)\n\t" |
1515 |
*(__m64*) (d + 24) = vfill; |
1423 |
"movq %0, 24(%1)\n\t" |
1516 |
*(__m64*) (d + 32) = vfill; |
1424 |
"movq %0, 32(%1)\n\t" |
1517 |
*(__m64*) (d + 40) = vfill; |
1425 |
"movq %0, 40(%1)\n\t" |
1518 |
*(__m64*) (d + 48) = vfill; |
1426 |
"movq %0, 48(%1)\n\t" |
1519 |
*(__m64*) (d + 56) = vfill; |
1427 |
"movq %0, 56(%1)\n\t" |
1520 |
|
1428 |
: /* no output */ |
|
|
1429 |
: "y" (vfill), "r" (d) |
1430 |
: "memory"); |
1431 |
w -= 64; |
1521 |
w -= 64; |
1432 |
d += 64; |
1522 |
d += 64; |
1433 |
} |
1523 |
} |
1434 |
while (w >= 4) |
1524 |
while (w >= 4) |
1435 |
{ |
1525 |
{ |
1436 |
*(CARD32 *)d = xor; |
1526 |
*(CARD32 *)d = xor; |
1437 |
|
1527 |
|
1438 |
w -= 4; |
1528 |
w -= 4; |
1439 |
d += 4; |
1529 |
d += 4; |
1440 |
} |
1530 |
} |
Lines 1446-1461
Link Here
|
1446 |
} |
1536 |
} |
1447 |
} |
1537 |
} |
1448 |
|
1538 |
|
1449 |
emms(); |
1539 |
_mm_empty(); |
|
|
1540 |
return TRUE; |
1541 |
} |
1542 |
|
1543 |
Bool |
1544 |
fbCopyAreammx (DrawablePtr pSrc, |
1545 |
DrawablePtr pDst, |
1546 |
int src_x, |
1547 |
int src_y, |
1548 |
int dst_x, |
1549 |
int dst_y, |
1550 |
int width, |
1551 |
int height) |
1552 |
{ |
1553 |
FbBits * src_bits; |
1554 |
FbStride src_stride; |
1555 |
int src_bpp; |
1556 |
int src_xoff; |
1557 |
int src_yoff; |
1558 |
|
1559 |
FbBits * dst_bits; |
1560 |
FbStride dst_stride; |
1561 |
int dst_bpp; |
1562 |
int dst_xoff; |
1563 |
int dst_yoff; |
1564 |
|
1565 |
CARD8 * src_bytes; |
1566 |
CARD8 * dst_bytes; |
1567 |
int byte_width; |
1568 |
|
1569 |
fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff); |
1570 |
fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff); |
1571 |
|
1572 |
if (src_bpp != 16 && src_bpp != 32) |
1573 |
return FALSE; |
1574 |
|
1575 |
if (dst_bpp != 16 && dst_bpp != 32) |
1576 |
return FALSE; |
1577 |
|
1578 |
if (src_bpp != dst_bpp) |
1579 |
{ |
1580 |
return FALSE; |
1581 |
} |
1582 |
|
1583 |
if (src_bpp == 16) |
1584 |
{ |
1585 |
src_stride = src_stride * sizeof (FbBits) / 2; |
1586 |
dst_stride = dst_stride * sizeof (FbBits) / 2; |
1587 |
src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); |
1588 |
dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); |
1589 |
byte_width = 2 * width; |
1590 |
src_stride *= 2; |
1591 |
dst_stride *= 2; |
1592 |
} |
1593 |
else |
1594 |
{ |
1595 |
src_stride = src_stride * sizeof (FbBits) / 4; |
1596 |
dst_stride = dst_stride * sizeof (FbBits) / 4; |
1597 |
src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff)); |
1598 |
dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff)); |
1599 |
byte_width = 4 * width; |
1600 |
src_stride *= 4; |
1601 |
dst_stride *= 4; |
1602 |
} |
1603 |
|
1604 |
while (height--) |
1605 |
{ |
1606 |
int w; |
1607 |
CARD8 *s = src_bytes; |
1608 |
CARD8 *d = dst_bytes; |
1609 |
src_bytes += src_stride; |
1610 |
dst_bytes += dst_stride; |
1611 |
w = byte_width; |
1612 |
|
1613 |
while (w >= 2 && ((unsigned long)d & 3)) |
1614 |
{ |
1615 |
*(CARD16 *)d = *(CARD16 *)s; |
1616 |
w -= 2; |
1617 |
s += 2; |
1618 |
d += 2; |
1619 |
} |
1620 |
|
1621 |
while (w >= 4 && ((unsigned int)d & 7)) |
1622 |
{ |
1623 |
*(CARD32 *)d = *(CARD32 *)s; |
1624 |
|
1625 |
w -= 4; |
1626 |
s += 4; |
1627 |
d += 4; |
1628 |
} |
1629 |
|
1630 |
while (w >= 64) |
1631 |
{ |
1632 |
*(__m64 *)(d + 0) = *(__m64 *)(s + 0); |
1633 |
*(__m64 *)(d + 8) = *(__m64 *)(s + 8); |
1634 |
*(__m64 *)(d + 16) = *(__m64 *)(s + 16); |
1635 |
*(__m64 *)(d + 24) = *(__m64 *)(s + 24); |
1636 |
*(__m64 *)(d + 32) = *(__m64 *)(s + 32); |
1637 |
*(__m64 *)(d + 40) = *(__m64 *)(s + 40); |
1638 |
*(__m64 *)(d + 48) = *(__m64 *)(s + 48); |
1639 |
*(__m64 *)(d + 56) = *(__m64 *)(s + 56); |
1640 |
w -= 64; |
1641 |
s += 64; |
1642 |
d += 64; |
1643 |
} |
1644 |
while (w >= 4) |
1645 |
{ |
1646 |
*(CARD32 *)d = *(CARD32 *)s; |
1647 |
|
1648 |
w -= 4; |
1649 |
s += 4; |
1650 |
d += 4; |
1651 |
} |
1652 |
if (w >= 2) |
1653 |
{ |
1654 |
*(CARD16 *)d = *(CARD16 *)s; |
1655 |
w -= 2; |
1656 |
s += 2; |
1657 |
d += 2; |
1658 |
} |
1659 |
} |
1660 |
|
1661 |
_mm_empty(); |
1450 |
return TRUE; |
1662 |
return TRUE; |
1451 |
} |
1663 |
} |
1452 |
|
1664 |
|
|
|
1665 |
void |
1666 |
fbCompositeCopyAreammx (CARD8 op, |
1667 |
PicturePtr pSrc, |
1668 |
PicturePtr pMask, |
1669 |
PicturePtr pDst, |
1670 |
INT16 xSrc, |
1671 |
INT16 ySrc, |
1672 |
INT16 xMask, |
1673 |
INT16 yMask, |
1674 |
INT16 xDst, |
1675 |
INT16 yDst, |
1676 |
CARD16 width, |
1677 |
CARD16 height) |
1678 |
{ |
1679 |
fbCopyAreammx (pSrc->pDrawable, |
1680 |
pDst->pDrawable, |
1681 |
xSrc, ySrc, |
1682 |
xDst, yDst, |
1683 |
width, height); |
1684 |
} |
1685 |
|
1686 |
#ifndef __amd64__ |
1453 |
Bool |
1687 |
Bool |
1454 |
fbHaveMMX (void) |
1688 |
fbHaveMMX (void) |
1455 |
{ |
1689 |
{ |
1456 |
static Bool initialized = FALSE; |
1690 |
static Bool initialized = FALSE; |
1457 |
static Bool mmx_present; |
1691 |
static Bool mmx_present; |
1458 |
|
1692 |
|
1459 |
if (!initialized) |
1693 |
if (!initialized) |
1460 |
{ |
1694 |
{ |
1461 |
int tmp; /* static variables are accessed through %ebx, |
1695 |
int tmp; /* static variables are accessed through %ebx, |
Lines 1466-1472
Link Here
|
1466 |
|
1700 |
|
1467 |
__asm__ __volatile__ ( |
1701 |
__asm__ __volatile__ ( |
1468 |
/* Check if bit 21 in flags word is writeable */ |
1702 |
/* Check if bit 21 in flags word is writeable */ |
1469 |
|
1703 |
|
1470 |
"pusha \n\t" |
1704 |
"pusha \n\t" |
1471 |
"pushfl \n\t" |
1705 |
"pushfl \n\t" |
1472 |
"popl %%eax \n\t" |
1706 |
"popl %%eax \n\t" |
Lines 1502-1514
Link Here
|
1502 |
: /* no input */); |
1736 |
: /* no input */); |
1503 |
|
1737 |
|
1504 |
initialized = TRUE; |
1738 |
initialized = TRUE; |
1505 |
|
1739 |
|
1506 |
mmx_present = tmp; |
1740 |
mmx_present = tmp; |
1507 |
} |
1741 |
} |
1508 |
|
1742 |
|
1509 |
return mmx_present; |
1743 |
return mmx_present; |
1510 |
} |
1744 |
} |
|
|
1745 |
#endif /* __amd64__ */ |
1511 |
|
1746 |
|
1512 |
|
1747 |
|
1513 |
#endif /* RENDER */ |
1748 |
#endif /* RENDER */ |
1514 |
#endif /* USE_GCC34_MMX */ |
1749 |
#endif /* USE_MMX */ |