/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003                *
 * by the Xiph.Org Foundation http://www.xiph.org/                  *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: mcomp.c,v 1.8 2003/12/03 08:59:41 arc Exp $

 ********************************************************************/
17 |
|
18 |
#include <stdlib.h>
#include "dsp.h"

/* The constant 128 replicated in each of the four 16-bit lanes of a
   64-bit quadword; loaded with movq in sub8x8_128__mmx to re-center
   unsigned pixel data.  aligned(8) keeps the movq load aligned; `used`
   stops the compiler discarding it (it is referenced only from asm). */
static const __attribute__ ((aligned(8),used)) ogg_int64_t V128w = 0x0080008000800080LL;

/* M(a): spell a global symbol name the way the target assembler expects.
   a.out-style targets (MinGW, Cygwin, OS/2, non-ELF OpenBSD) prefix C
   symbols with an underscore; ELF targets do not. */
#if defined(__MINGW32__) || defined(__CYGWIN__) || \
    defined(__OS2__) || (defined (__OpenBSD__) && !defined(__ELF__))
# define M(a) "_" #a
#else
# define M(a) #a
#endif

/* Scalar reference helpers (apparently unused in this file; presumably
   kept for parity with the portable dsp implementations -- TODO confirm). */
#define DSP_OP_AVG(a,b) ((((int)(a)) + ((int)(b)))/2)
#define DSP_OP_DIFF(a,b) (((int)(a)) - ((int)(b)))
#define DSP_OP_ABS_DIFF(a,b) abs((((int)(a)) - ((int)(b))))
33 |
|
34 |
/* DCT input stage: FiltPtr - ReconPtr for an 8x8 block.
 * Reads 8 rows of 8 unsigned bytes from each source (advancing by the
 * respective per-row strides), widens to 16 bits, and writes the 64
 * signed 16-bit differences consecutively to DctInputPtr.
 * The unrolled loop body (.rept 8) processes one full row per pass. */
static void sub8x8__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr,
                  ogg_int16_t *DctInputPtr, ogg_uint32_t PixelsPerLine,
                  ogg_uint32_t ReconPixelsPerLine)
{
  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm7, %%mm7 \n\t" /* mm7 = 0; used to zero-extend bytes */

    ".rept 8 \n\t"
    " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
    " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr */
    " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
    " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
    " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr), low 4 pixels */
    " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr), low 4 pixels */
    " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr), high 4 pixels */
    " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr), high 4 pixels */
    /* start calculation */
    " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ReconPtr */
    " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ReconPtr */
    " movq %%mm0, (%2) \n\t" /* write answer out */
    " movq %%mm2, 8(%2) \n\t" /* write answer out */
    /* Increment pointers: 16 bytes = 8 output words per row */
    " add $16, %2 \n\t"
    " add %3, %0 \n\t"
    " add %4, %1 \n\t"
    ".endr \n\t"

    : "+r" (FiltPtr),
      "+r" (ReconPtr),
      "+r" (DctInputPtr)
    : "m" (PixelsPerLine),
      "m" (ReconPixelsPerLine)
    : "memory"
  );
}
72 |
|
73 |
/* DCT input stage for intra blocks: FiltPtr - 128 for an 8x8 block.
 * Widens 8 rows of 8 unsigned bytes to 16 bits, subtracts the constant
 * 128 (loaded from V128w) to re-center around zero, and writes the 64
 * signed 16-bit results consecutively to DctInputPtr. */
static void sub8x8_128__mmx (unsigned char *FiltPtr, ogg_int16_t *DctInputPtr,
                      ogg_uint32_t PixelsPerLine)
{
  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm7, %%mm7 \n\t" /* mm7 = 0; used to zero-extend bytes */
    " movq "M(V128w)", %%mm1 \n\t" /* mm1 = four words of 0x0080 (128) */

    ".rept 8 \n\t"
    " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
    " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
    " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr), low 4 pixels */
    " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr), high 4 pixels */
    /* start calculation */
    " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - 128 */
    " psubw %%mm1, %%mm2 \n\t" /* mm2 = FiltPtr - 128 */
    " movq %%mm0, (%1) \n\t" /* write answer out */
    " movq %%mm2, 8(%1) \n\t" /* write answer out */
    /* Increment pointers: 16 bytes = 8 output words per row */
    " add $16, %1 \n\t"
    " add %2, %0 \n\t"
    ".endr \n\t"

    : "+r" (FiltPtr),
      "+r" (DctInputPtr)
    : "r" (PixelsPerLine)
    : "memory"
  );
}
104 |
|
105 |
/* DCT input stage for half-pel prediction: FiltPtr minus the average of
 * two reconstruction blocks.  The two reference rows are widened to 16
 * bits, added, and halved with a logical right shift (truncating, i.e.
 * floor((a+b)/2)), then subtracted from the widened source row.  Writes
 * 64 signed 16-bit values consecutively to DctInputPtr. */
static void sub8x8avg2__mmx (unsigned char *FiltPtr, unsigned char *ReconPtr1,
                      unsigned char *ReconPtr2, ogg_int16_t *DctInputPtr,
                      ogg_uint32_t PixelsPerLine,
                      ogg_uint32_t ReconPixelsPerLine)
{
  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm7, %%mm7 \n\t" /* mm7 = 0; used to zero-extend bytes */

    ".rept 8 \n\t"
    " movq (%0), %%mm0 \n\t" /* mm0 = FiltPtr */
    " movq (%1), %%mm1 \n\t" /* mm1 = ReconPtr1 */
    " movq (%2), %%mm4 \n\t" /* mm4 = ReconPtr2 */
    " movq %%mm0, %%mm2 \n\t" /* dup to prepare for up conversion */
    " movq %%mm1, %%mm3 \n\t" /* dup to prepare for up conversion */
    " movq %%mm4, %%mm5 \n\t" /* dup to prepare for up conversion */
    /* convert from UINT8 to INT16 */
    " punpcklbw %%mm7, %%mm0 \n\t" /* mm0 = INT16(FiltPtr), low 4 pixels */
    " punpcklbw %%mm7, %%mm1 \n\t" /* mm1 = INT16(ReconPtr1), low 4 pixels */
    " punpcklbw %%mm7, %%mm4 \n\t" /* mm4 = INT16(ReconPtr2), low 4 pixels */
    " punpckhbw %%mm7, %%mm2 \n\t" /* mm2 = INT16(FiltPtr), high 4 pixels */
    " punpckhbw %%mm7, %%mm3 \n\t" /* mm3 = INT16(ReconPtr1), high 4 pixels */
    " punpckhbw %%mm7, %%mm5 \n\t" /* mm5 = INT16(ReconPtr2), high 4 pixels */
    /* average ReconPtr1 and ReconPtr2 */
    " paddw %%mm4, %%mm1 \n\t" /* mm1 = ReconPtr1 + ReconPtr2 */
    " paddw %%mm5, %%mm3 \n\t" /* mm3 = ReconPtr1 + ReconPtr2 */
    " psrlw $1, %%mm1 \n\t" /* mm1 = (ReconPtr1 + ReconPtr2) / 2 */
    " psrlw $1, %%mm3 \n\t" /* mm3 = (ReconPtr1 + ReconPtr2) / 2 */
    " psubw %%mm1, %%mm0 \n\t" /* mm0 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
    " psubw %%mm3, %%mm2 \n\t" /* mm2 = FiltPtr - ((ReconPtr1 + ReconPtr2) / 2) */
    " movq %%mm0, (%3) \n\t" /* write answer out */
    " movq %%mm2, 8(%3) \n\t" /* write answer out */
    /* Increment pointers: 16 bytes = 8 output words per row */
    " add $16, %3 \n\t"
    " add %4, %0 \n\t"
    " add %5, %1 \n\t"
    " add %5, %2 \n\t"
    ".endr \n\t"

    : "+r" (FiltPtr),
      "+r" (ReconPtr1),
      "+r" (ReconPtr2),
      "+r" (DctInputPtr)
    : "m" (PixelsPerLine),
      "m" (ReconPixelsPerLine)
    : "memory"
  );
}
154 |
|
155 |
/* Row SAD of one 8-pixel row: computes the per-byte absolute differences
 * of Src1 vs Src2, sums the low four and the high four pixels separately,
 * and returns the LARGER of the two 4-pixel sums (low 16 bits only).
 * The max fits in 16 bits since 4*255 = 1020, hence the final mask. */
static ogg_uint32_t row_sad8__mmx (unsigned char *Src1, unsigned char *Src2)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* zero out mm7 for unpack */
    " movq (%1), %%mm0 \n\t" /* take 8 bytes */
    " movq (%2), %%mm1 \n\t"

    /* |A - B| per byte via saturating subtraction in both directions:
       one of the two results is zero, the other the absolute difference */
    " movq %%mm0, %%mm2 \n\t"
    " psubusb %%mm1, %%mm0 \n\t" /* A - B */
    " psubusb %%mm2, %%mm1 \n\t" /* B - A */
    " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */

    " movq %%mm0, %%mm1 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* unpack low four bytes to higher precision */
    " punpckhbw %%mm7, %%mm1 \n\t" /* unpack high four bytes to higher precision */

    /* fold each register's 4 words down into its low word */
    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"
    " psrlq $32, %%mm2 \n\t" /* fold and add */
    " psrlq $32, %%mm3 \n\t"
    " paddw %%mm2, %%mm0 \n\t"
    " paddw %%mm3, %%mm1 \n\t"
    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"
    " psrlq $16, %%mm2 \n\t"
    " psrlq $16, %%mm3 \n\t"
    " paddw %%mm2, %%mm0 \n\t"
    " paddw %%mm3, %%mm1 \n\t"

    /* unsigned word max via saturating subtract then add */
    " psubusw %%mm0, %%mm1 \n\t"
    " paddw %%mm0, %%mm1 \n\t" /* mm1 = max(mm1, mm0) */
    " movd %%mm1, %0 \n\t"
    " andl $0xffff, %0 \n\t" /* keep only the low-word sum */

    : "=m" (MaxSad),
      "+r" (Src1),
      "+r" (Src2)
    :
    : "memory"
  );
  return MaxSad;
}
203 |
|
204 |
/* Column SAD of an 8x8 block: per-column absolute differences are
 * accumulated separately for the top four rows (mm4/mm5) and the bottom
 * four rows (mm6/mm7); the return value is the largest single-column
 * 4-row sum over all eight columns and both halves (low 16 bits only).
 * Uses edi as the hand-managed loop counter (declared in clobbers). */
static ogg_uint32_t col_sad8x8__mmx (unsigned char *Src1, unsigned char *Src2,
                              ogg_uint32_t stride)
{
  ogg_uint32_t MaxSad;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm3, %%mm3 \n\t" /* zero out mm3 for unpack */
    " pxor %%mm4, %%mm4 \n\t" /* mm4 low sum */
    " pxor %%mm5, %%mm5 \n\t" /* mm5 high sum */
    " pxor %%mm6, %%mm6 \n\t" /* mm6 low sum */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 high sum */
    " mov $4, %%edi \n\t" /* 4 rows: top half of the block */
    "1: \n\t"
    " movq (%1), %%mm0 \n\t" /* take 8 bytes */
    " movq (%2), %%mm1 \n\t" /* take 8 bytes */

    " movq %%mm0, %%mm2 \n\t"
    " psubusb %%mm1, %%mm0 \n\t" /* A - B */
    " psubusb %%mm2, %%mm1 \n\t" /* B - A */
    " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
    " movq %%mm0, %%mm1 \n\t"

    " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
    " paddw %%mm0, %%mm4 \n\t" /* accumulate difference... */
    " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
    " paddw %%mm1, %%mm5 \n\t" /* accumulate difference... */
    " add %3, %1 \n\t" /* Inc pointer into the new data */
    " add %3, %2 \n\t" /* Inc pointer into the new data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    " mov $4, %%edi \n\t" /* 4 rows: bottom half of the block */
    "2: \n\t"
    " movq (%1), %%mm0 \n\t" /* take 8 bytes */
    " movq (%2), %%mm1 \n\t" /* take 8 bytes */

    " movq %%mm0, %%mm2 \n\t"
    " psubusb %%mm1, %%mm0 \n\t" /* A - B */
    " psubusb %%mm2, %%mm1 \n\t" /* B - A */
    " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
    " movq %%mm0, %%mm1 \n\t"

    " punpcklbw %%mm3, %%mm0 \n\t" /* unpack to higher precision for accumulation */
    " paddw %%mm0, %%mm6 \n\t" /* accumulate difference... */
    " punpckhbw %%mm3, %%mm1 \n\t" /* unpack high four bytes to higher precision */
    " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
    " add %3, %1 \n\t" /* Inc pointer into the new data */
    " add %3, %2 \n\t" /* Inc pointer into the new data */

    " dec %%edi \n\t"
    " jnz 2b \n\t"

    /* unsigned per-word max via saturating subtract then add */
    " psubusw %%mm6, %%mm7 \n\t"
    " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm7, mm6) */
    " psubusw %%mm4, %%mm5 \n\t"
    " paddw %%mm4, %%mm5 \n\t" /* mm5 = max(mm5, mm4) */
    " psubusw %%mm5, %%mm7 \n\t"
    " paddw %%mm5, %%mm7 \n\t" /* mm7 = max(mm5, mm7) */
    /* horizontal max across the four word lanes of mm7 */
    " movq %%mm7, %%mm6 \n\t"
    " psrlq $32, %%mm6 \n\t"
    " psubusw %%mm6, %%mm7 \n\t"
    " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm6, mm7) */
    " movq %%mm7, %%mm6 \n\t"
    " psrlq $16, %%mm6 \n\t"
    " psubusw %%mm6, %%mm7 \n\t"
    " paddw %%mm6, %%mm7 \n\t" /* mm7 = max(mm6, mm7) */
    " movd %%mm7, %0 \n\t"
    " andl $0xffff, %0 \n\t" /* keep only the low-word result */

    : "=r" (MaxSad),
      "+r" (Src1),
      "+r" (Src2)
    : "r" (stride)
    : "memory", "edi"
  );

  return MaxSad;
}
285 |
|
286 |
/* Full 8x8 sum of absolute differences between two pixel blocks with
 * independent row strides.  The total fits in 16 bits (64*255 = 16320),
 * so word accumulation cannot overflow and the final fold masks to the
 * low 16 bits. */
static ogg_uint32_t sad8x8__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
                          unsigned char *ptr2, ogg_uint32_t stride2)
{
  ogg_uint32_t DiffVal;

  __asm__ __volatile__ (
    " .balign 16 \n\t"
    " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
    ".rept 8 \n\t"
    " movq (%1), %%mm0 \n\t" /* take 8 bytes */
    " movq (%2), %%mm1 \n\t"
    " movq %%mm0, %%mm2 \n\t"

    " psubusb %%mm1, %%mm0 \n\t" /* A - B */
    " psubusb %%mm2, %%mm1 \n\t" /* B - A */
    " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
    " movq %%mm0, %%mm1 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
    " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
    " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
    " add %3, %1 \n\t" /* Inc pointer into the new data */
    " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
    " add %4, %2 \n\t" /* Inc pointer into ref data */
    ".endr \n\t"

    /* fold the four word lanes of mm7 into its low word */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddw %%mm0, %%mm7 \n\t"
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $16, %%mm7 \n\t"
    " paddw %%mm0, %%mm7 \n\t"
    " movd %%mm7, %0 \n\t"
    " andl $0xffff, %0 \n\t" /* keep only the low-word total */

    : "=m" (DiffVal),
      "+r" (ptr1),
      "+r" (ptr2)
    : "r" (stride1),
      "r" (stride2)
    : "memory"
  );

  return DiffVal;
}
332 |
|
333 |
/* 8x8 SAD with an early-out threshold hint.
 * The MMX implementation has no cheap partial-sum early-termination path,
 * so `thres` is intentionally ignored and the full SAD is always computed;
 * callers only require that the return value be >= thres when the true SAD
 * is, which an exact SAD trivially satisfies. */
static ogg_uint32_t sad8x8_thres__mmx (unsigned char *ptr1, ogg_uint32_t stride1,
                                unsigned char *ptr2, ogg_uint32_t stride2,
                                ogg_uint32_t thres)
{
  (void)thres; /* unused by design -- silences unused-parameter warnings */
  return sad8x8__mmx (ptr1, stride1, ptr2, stride2);
}
339 |
|
340 |
/* 8x8 SAD of the source block against the byte-wise average of two
 * reference blocks (half-pel prediction).  The average is computed
 * without unpacking via the identity floor((a+b)/2) = (a & b) + ((a ^ b) >> 1),
 * where the xor term is masked with 0xfe per byte before the shift so no
 * bits leak between lanes.  `thres` is accepted for interface parity but
 * ignored (no early-out in this version).  edi is the loop counter. */
static ogg_uint32_t sad8x8_xy2_thres__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                    unsigned char *RefDataPtr1,
                                    unsigned char *RefDataPtr2, ogg_uint32_t RefStride,
                                    ogg_uint32_t thres)
{
  ogg_uint32_t DiffVal;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pcmpeqd %%mm5, %%mm5 \n\t" /* fefefefefefefefe in mm5 */
    " paddb %%mm5, %%mm5 \n\t"

    " pxor %%mm6, %%mm6 \n\t" /* zero out mm6 for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 contains the result */
    " mov $8, %%edi \n\t" /* 8 rows */
    "1: \n\t"
    " movq (%1), %%mm0 \n\t" /* take 8 bytes */

    " movq (%2), %%mm2 \n\t"
    " movq (%3), %%mm3 \n\t" /* take average of mm2 and mm3 */
    " movq %%mm2, %%mm1 \n\t"
    " pand %%mm3, %%mm1 \n\t" /* mm1 = a & b */
    " pxor %%mm2, %%mm3 \n\t" /* mm3 = a ^ b */
    " pand %%mm5, %%mm3 \n\t" /* clear each byte's low bit before shifting */
    " psrlq $1, %%mm3 \n\t" /* mm3 = (a ^ b) >> 1, per byte */
    " paddb %%mm3, %%mm1 \n\t" /* mm1 = floor((a + b) / 2), per byte */

    " movq %%mm0, %%mm2 \n\t"

    " psubusb %%mm1, %%mm0 \n\t" /* A - B */
    " psubusb %%mm2, %%mm1 \n\t" /* B - A */
    " por %%mm1, %%mm0 \n\t" /* and or gives abs difference */
    " movq %%mm0, %%mm1 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* unpack to higher precision for accumulation */
    " paddw %%mm0, %%mm7 \n\t" /* accumulate difference... */
    " punpckhbw %%mm6, %%mm1 \n\t" /* unpack high four bytes to higher precision */
    " add %4, %1 \n\t" /* Inc pointer into the new data */
    " paddw %%mm1, %%mm7 \n\t" /* accumulate difference... */
    " add %5, %2 \n\t" /* Inc pointer into ref data */
    " add %5, %3 \n\t" /* Inc pointer into ref data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* fold the four word lanes of mm7 into its low word */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddw %%mm0, %%mm7 \n\t"
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $16, %%mm7 \n\t"
    " paddw %%mm0, %%mm7 \n\t"
    " movd %%mm7, %0 \n\t"
    " andl $0xffff, %0 \n\t" /* keep only the low-word total */

    : "=m" (DiffVal),
      "+r" (SrcData),
      "+r" (RefDataPtr1),
      "+r" (RefDataPtr2)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  return DiffVal;
}
406 |
|
407 |
/* Intra-block mismatch metric: scaled population variance of an 8x8
 * block.  Accumulates the pixel sum (mm5, words) and the sum of squared
 * pixels (mm7, dwords via pmaddwd) over 8 rows, then returns
 * 64*XXSum - XSum^2 (= 64^2 times the population variance).
 * The movsx of the 16-bit fold is safe: the pixel sum is at most
 * 64*255 = 16320, which fits in a signed 16-bit word. */
static ogg_uint32_t intra8x8_err__mmx (unsigned char *DataPtr, ogg_uint32_t Stride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm5, %%mm5 \n\t" /* mm5 = running pixel sum (words) */
    " pxor %%mm6, %%mm6 \n\t" /* mm6 = 0, for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 = running squared sum (dwords) */
    " mov $8, %%edi \n\t" /* 8 rows */
    "1: \n\t"
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */
    " movq %%mm0, %%mm2 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* low 4 pixels to words */
    " punpckhbw %%mm6, %%mm2 \n\t" /* high 4 pixels to words */

    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"

    " pmaddwd %%mm0, %%mm0 \n\t" /* pairwise x*x sums -> 2 dwords */
    " pmaddwd %%mm2, %%mm2 \n\t"

    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"

    " add %3, %2 \n\t" /* Inc pointer into src data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* fold word lanes of mm5 into its low word -> XSum */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%edi \n\t"
    " movsx %%di, %%edi \n\t" /* take just the low 16 bits, sign-extended */
    " movl %%edi, %0 \n\t"

    /* fold the two dword lanes of mm7 -> XXSum */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"

    : "=r" (XSum),
      "=r" (XXSum),
      "+r" (DataPtr)
    : "r" (Stride)
    : "edi", "memory"
  );

  /* Compute population variance as mis-match metric. */
  return (( (XXSum<<6) - XSum*XSum ) );
}
465 |
|
466 |
/* Inter-block mismatch metric: scaled population variance of the 8x8
 * difference SrcData - RefDataPtr.  Accumulates the signed difference
 * sum (mm5, words) and the sum of squared differences (mm7, dwords via
 * pmaddwd), then returns 64*XXSum - XSum^2.  XSum may be negative; the
 * movsx sign-extension and the modulo-2^32 arithmetic of the final
 * expression keep the result correct (|XSum| <= 16320 fits 16 bits). */
static ogg_uint32_t inter8x8_err__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                unsigned char *RefDataPtr, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pxor %%mm5, %%mm5 \n\t" /* mm5 = running difference sum (words) */
    " pxor %%mm6, %%mm6 \n\t" /* mm6 = 0, for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 = running squared sum (dwords) */
    " mov $8, %%edi \n\t" /* 8 rows */
    "1: \n\t"
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */
    " movq (%3), %%mm1 \n\t"
    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* low 4 src pixels to words */
    " punpcklbw %%mm6, %%mm1 \n\t" /* low 4 ref pixels to words */
    " punpckhbw %%mm6, %%mm2 \n\t" /* high 4 src pixels to words */
    " punpckhbw %%mm6, %%mm3 \n\t" /* high 4 ref pixels to words */

    " psubsw %%mm1, %%mm0 \n\t" /* src - ref, low half */
    " psubsw %%mm3, %%mm2 \n\t" /* src - ref, high half */

    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"

    " pmaddwd %%mm0, %%mm0 \n\t" /* pairwise d*d sums -> 2 dwords */
    " pmaddwd %%mm2, %%mm2 \n\t"

    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"

    " add %4, %2 \n\t" /* Inc pointer into src data */
    " add %5, %3 \n\t" /* Inc pointer into ref data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* fold word lanes of mm5 into its low word -> XSum */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%edi \n\t"
    " movsx %%di, %%edi \n\t" /* low 16 bits, sign-extended (sum may be < 0) */
    " movl %%edi, %0 \n\t"

    /* fold the two dword lanes of mm7 -> XXSum */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"

    : "=m" (XSum),
      "=m" (XXSum),
      "+r" (SrcData),
      "+r" (RefDataPtr)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  return (( (XXSum<<6) - XSum*XSum ));
}
535 |
|
536 |
/* Inter-block mismatch metric against a half-pel prediction: scaled
 * population variance of SrcData minus the byte-wise average of two
 * reference blocks.  The average uses the same and/xor trick as
 * sad8x8_xy2_thres__mmx: floor((a+b)/2) = (a & b) + (((a ^ b) & 0xfe) >> 1).
 * Accumulates the signed difference sum (mm5) and the squared-difference
 * sum (mm7, via pmaddwd) over 8 rows and returns 64*XXSum - XSum^2. */
static ogg_uint32_t inter8x8_err_xy2__mmx (unsigned char *SrcData, ogg_uint32_t SrcStride,
                                    unsigned char *RefDataPtr1,
                                    unsigned char *RefDataPtr2, ogg_uint32_t RefStride)
{
  ogg_uint32_t XSum;
  ogg_uint32_t XXSum;

  __asm__ __volatile__ (
    " .balign 16 \n\t"

    " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */
    " paddb %%mm4, %%mm4 \n\t"
    " pxor %%mm5, %%mm5 \n\t" /* mm5 = running difference sum (words) */
    " pxor %%mm6, %%mm6 \n\t" /* mm6 = 0, for unpack */
    " pxor %%mm7, %%mm7 \n\t" /* mm7 = running squared sum (dwords) */
    " mov $8, %%edi \n\t" /* 8 rows */
    "1: \n\t"
    " movq (%2), %%mm0 \n\t" /* take 8 bytes */

    " movq (%3), %%mm2 \n\t"
    " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */
    " movq %%mm2, %%mm1 \n\t"
    " pand %%mm3, %%mm1 \n\t" /* mm1 = a & b */
    " pxor %%mm2, %%mm3 \n\t" /* mm3 = a ^ b */
    " pand %%mm4, %%mm3 \n\t" /* clear each byte's low bit before shifting */
    " psrlq $1, %%mm3 \n\t" /* mm3 = (a ^ b) >> 1, per byte */
    " paddb %%mm3, %%mm1 \n\t" /* mm1 = floor((a + b) / 2), per byte */

    " movq %%mm0, %%mm2 \n\t"
    " movq %%mm1, %%mm3 \n\t"

    " punpcklbw %%mm6, %%mm0 \n\t" /* low 4 src pixels to words */
    " punpcklbw %%mm6, %%mm1 \n\t" /* low 4 averaged ref pixels to words */
    " punpckhbw %%mm6, %%mm2 \n\t" /* high 4 src pixels to words */
    " punpckhbw %%mm6, %%mm3 \n\t" /* high 4 averaged ref pixels to words */

    " psubsw %%mm1, %%mm0 \n\t" /* src - avg(ref1, ref2), low half */
    " psubsw %%mm3, %%mm2 \n\t" /* src - avg(ref1, ref2), high half */

    " paddw %%mm0, %%mm5 \n\t"
    " paddw %%mm2, %%mm5 \n\t"

    " pmaddwd %%mm0, %%mm0 \n\t" /* pairwise d*d sums -> 2 dwords */
    " pmaddwd %%mm2, %%mm2 \n\t"

    " paddd %%mm0, %%mm7 \n\t"
    " paddd %%mm2, %%mm7 \n\t"

    " add %5, %2 \n\t" /* Inc pointer into src data */
    " add %6, %3 \n\t" /* Inc pointer into ref data */
    " add %6, %4 \n\t" /* Inc pointer into ref data */

    " dec %%edi \n\t"
    " jnz 1b \n\t"

    /* fold word lanes of mm5 into its low word -> XSum */
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $32, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movq %%mm5, %%mm0 \n\t"
    " psrlq $16, %%mm5 \n\t"
    " paddw %%mm0, %%mm5 \n\t"
    " movd %%mm5, %%edi \n\t"
    " movsx %%di, %%edi \n\t" /* low 16 bits, sign-extended (sum may be < 0) */
    " movl %%edi, %0 \n\t"

    /* fold the two dword lanes of mm7 -> XXSum */
    " movq %%mm7, %%mm0 \n\t"
    " psrlq $32, %%mm7 \n\t"
    " paddd %%mm0, %%mm7 \n\t"
    " movd %%mm7, %1 \n\t"

    : "=m" (XSum),
      "=m" (XXSum),
      "+r" (SrcData),
      "+r" (RefDataPtr1),
      "+r" (RefDataPtr2)
    : "m" (SrcStride),
      "m" (RefStride)
    : "edi", "memory"
  );

  /* Compute and return population variance as mis-match metric. */
  return (( (XXSum<<6) - XSum*XSum ));
}
619 |
|
620 |
/* Exit MMX state: emms clears the x87 tag word so the FPU registers
 * (which MMX aliases) are usable for floating point again.  Installed
 * in the dispatch table so callers can invoke it after a run of the
 * MMX routines above. */
static void restore_fpu (void)
{
  __asm__ __volatile__ (
    " emms \n\t"
  );
}
626 |
|
627 |
/* Install the MMX implementations into the DspFunctions dispatch table,
 * overriding whichever entries were set before.  NOTE(review): no CPU
 * feature check is done here -- presumably the caller has already
 * verified MMX support; confirm at the call site. */
void dsp_i386_mmx_init(DspFunctions *funcs)
{
  funcs->restore_fpu = restore_fpu;
  funcs->sub8x8 = sub8x8__mmx;
  funcs->sub8x8_128 = sub8x8_128__mmx;
  funcs->sub8x8avg2 = sub8x8avg2__mmx;
  funcs->row_sad8 = row_sad8__mmx;
  funcs->col_sad8x8 = col_sad8x8__mmx;
  funcs->sad8x8 = sad8x8__mmx;
  funcs->sad8x8_thres = sad8x8_thres__mmx;
  funcs->sad8x8_xy2_thres = sad8x8_xy2_thres__mmx;
  funcs->intra8x8_err = intra8x8_err__mmx;
  funcs->inter8x8_err = inter8x8_err__mmx;
  funcs->inter8x8_err_xy2 = inter8x8_err_xy2__mmx;
}
642 |
|