Lines 122-128
coeffs:
Link Here
|
122 |
movq mm1,[src4] ; R6 R2 r6 r2 |
122 |
movq mm1,[src4] ; R6 R2 r6 r2 |
123 |
movq mm2,[src1] ; R3 R1 r3 r1 |
123 |
movq mm2,[src1] ; R3 R1 r3 r1 |
124 |
movq mm3,[src5] ; R7 R5 r7 r5 |
124 |
movq mm3,[src5] ; R7 R5 r7 r5 |
125 |
movq mm4,[wm1010] |
125 |
movq mm4,[ebx + wm1010 wrt ..gotoff] |
126 |
pand mm4,mm0 |
126 |
pand mm4,mm0 |
127 |
por mm4,mm1 |
127 |
por mm4,mm1 |
128 |
por mm4,mm2 |
128 |
por mm4,mm2 |
Lines 131-159
coeffs:
Link Here
|
131 |
movd eax,mm4 |
131 |
movd eax,mm4 |
132 |
or eax,eax |
132 |
or eax,eax |
133 |
jz near .skip1 |
133 |
jz near .skip1 |
134 |
movq mm4,[coeffs+16] ; C4 C4 C4 C4 |
134 |
movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
135 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
135 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
136 |
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 |
136 |
movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
137 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
137 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
138 |
movq mm5,[coeffs+32] ; C6 C2 C6 C2 |
138 |
movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 |
139 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
139 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
140 |
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 |
140 |
movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 |
141 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
141 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
142 |
movq mm7,[coeffs+48] ; C3 C1 C3 C1 |
142 |
movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 |
143 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
143 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
144 |
rounder_op mm4, rounder_arg |
144 |
rounder_op mm4, rounder_arg |
145 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
145 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
146 |
paddd mm4,mm5 ; A0 a0 |
146 |
paddd mm4,mm5 ; A0 a0 |
147 |
psubd mm6,mm5 ; A3 a3 |
147 |
psubd mm6,mm5 ; A3 a3 |
148 |
movq mm5,[coeffs+56] ; C7 C5 C7 C5 |
148 |
movq mm5,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 |
149 |
pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5 |
149 |
pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5 |
150 |
rounder_op mm0, rounder_arg |
150 |
rounder_op mm0, rounder_arg |
151 |
paddd mm1,mm0 ; A1 a1 |
151 |
paddd mm1,mm0 ; A1 a1 |
152 |
paddd mm0,mm0 |
152 |
paddd mm0,mm0 |
153 |
psubd mm0,mm1 ; A2 a2 |
153 |
psubd mm0,mm1 ; A2 a2 |
154 |
pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1 |
154 |
pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1 |
155 |
paddd mm7,mm5 ; B0 b0 |
155 |
paddd mm7,mm5 ; B0 b0 |
156 |
movq mm5,[coeffs+72] ; -C5 -C1 -C5 -C1 |
156 |
movq mm5,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 |
157 |
pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
157 |
pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
158 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
158 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
159 |
paddd mm4,mm4 ; 2A0 2a0 |
159 |
paddd mm4,mm4 ; 2A0 2a0 |
Lines 170-183
coeffs:
Link Here
|
170 |
packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1 |
170 |
packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1 |
171 |
movq [dst],mm7 |
171 |
movq [dst],mm7 |
172 |
movq mm1,[src1] ; R3 R1 r3 r1 |
172 |
movq mm1,[src1] ; R3 R1 r3 r1 |
173 |
movq mm4,[coeffs+80] ;-C1 C5 -C1 C5 |
173 |
movq mm4,[ebx + coeffs+80 wrt ..gotoff] ;-C1 C5 -C1 C5 |
174 |
movq [dst + 24],mm2 |
174 |
movq [dst + 24],mm2 |
175 |
pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1 |
175 |
pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1 |
176 |
movq mm7,[coeffs+88] ; C3 C7 C3 C7 |
176 |
movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 |
177 |
pmaddwd mm1,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 |
177 |
pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 |
178 |
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 |
178 |
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 |
179 |
movq mm2,mm0 ; A2 a2 |
179 |
movq mm2,mm0 ; A2 a2 |
180 |
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 |
180 |
pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 |
181 |
paddd mm4,mm7 ; B2 b2 |
181 |
paddd mm4,mm7 ; B2 b2 |
182 |
paddd mm2,mm4 ; A2+B2 a2+b2 |
182 |
paddd mm2,mm4 ; A2+B2 a2+b2 |
183 |
psubd mm0,mm4 ; a2-B2 a2-b2 |
183 |
psubd mm0,mm4 ; a2-B2 a2-b2 |
Lines 196-202
coeffs:
Link Here
|
196 |
jmp short .skip2 |
196 |
jmp short .skip2 |
197 |
.skip1 |
197 |
.skip1 |
198 |
pslld mm0,16 |
198 |
pslld mm0,16 |
199 |
paddd mm0,[d40000] |
199 |
paddd mm0,[ebx + d40000 wrt ..gotoff] |
200 |
psrad mm0,13 |
200 |
psrad mm0,13 |
201 |
packssdw mm0,mm0 |
201 |
packssdw mm0,mm0 |
202 |
movq [ dst ],mm0 |
202 |
movq [ dst ],mm0 |
Lines 240-268
coeffs:
Link Here
|
240 |
movd eax,mm4 |
240 |
movd eax,mm4 |
241 |
or eax,eax |
241 |
or eax,eax |
242 |
jz near bt |
242 |
jz near bt |
243 |
movq mm4,[coeffs+16] ; C4 C4 C4 C4 |
243 |
movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
244 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
244 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
245 |
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 |
245 |
movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
246 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
246 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
247 |
movq mm5,[coeffs+32] ; C6 C2 C6 C2 |
247 |
movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 |
248 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
248 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
249 |
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 |
249 |
movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 |
250 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
250 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
251 |
movq mm7,[coeffs+48] ; C3 C1 C3 C1 |
251 |
movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 |
252 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
252 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
253 |
rounder_op mm4, rounder_arg |
253 |
rounder_op mm4, rounder_arg |
254 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
254 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
255 |
paddd mm4,mm5 ; A0 a0 |
255 |
paddd mm4,mm5 ; A0 a0 |
256 |
psubd mm6,mm5 ; A3 a3 |
256 |
psubd mm6,mm5 ; A3 a3 |
257 |
movq mm5,[coeffs+56] ; C7 C5 C7 C5 |
257 |
movq mm5,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 |
258 |
pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5 |
258 |
pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5 |
259 |
rounder_op mm0, rounder_arg |
259 |
rounder_op mm0, rounder_arg |
260 |
paddd mm1,mm0 ; A1 a1 |
260 |
paddd mm1,mm0 ; A1 a1 |
261 |
paddd mm0,mm0 |
261 |
paddd mm0,mm0 |
262 |
psubd mm0,mm1 ; A2 a2 |
262 |
psubd mm0,mm1 ; A2 a2 |
263 |
pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1 |
263 |
pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1 |
264 |
paddd mm7,mm5 ; B0 b0 |
264 |
paddd mm7,mm5 ; B0 b0 |
265 |
movq mm5,[coeffs+72] ; -C5 -C1 -C5 -C1 |
265 |
movq mm5,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 |
266 |
pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
266 |
pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
267 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
267 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
268 |
paddd mm4,mm4 ; 2A0 2a0 |
268 |
paddd mm4,mm4 ; 2A0 2a0 |
Lines 279-292
coeffs:
Link Here
|
279 |
packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1 |
279 |
packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1 |
280 |
movq [ dst ],mm7 |
280 |
movq [ dst ],mm7 |
281 |
movq mm1,[src1] ; R3 R1 r3 r1 |
281 |
movq mm1,[src1] ; R3 R1 r3 r1 |
282 |
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5 |
282 |
movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5 |
283 |
movq [ dst + 24 ],mm2 |
283 |
movq [ dst + 24 ],mm2 |
284 |
pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1 |
284 |
pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1 |
285 |
movq mm7,[coeffs+88] ; C3 C7 C3 C7 |
285 |
movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 |
286 |
pmaddwd mm1,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 |
286 |
pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 |
287 |
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 |
287 |
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 |
288 |
movq mm2,mm0 ; A2 a2 |
288 |
movq mm2,mm0 ; A2 a2 |
289 |
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 |
289 |
pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 |
290 |
paddd mm4,mm7 ; B2 b2 |
290 |
paddd mm4,mm7 ; B2 b2 |
291 |
paddd mm2,mm4 ; A2+B2 a2+b2 |
291 |
paddd mm2,mm4 ; A2+B2 a2+b2 |
292 |
psubd mm0,mm4 ; a2-B2 a2-b2 |
292 |
psubd mm0,mm4 ; a2-B2 a2-b2 |
Lines 330-346
coeffs:
Link Here
|
330 |
movq mm1,[src4] ; R6 R2 r6 r2 |
330 |
movq mm1,[src4] ; R6 R2 r6 r2 |
331 |
movq mm2,[src1] ; R3 R1 r3 r1 |
331 |
movq mm2,[src1] ; R3 R1 r3 r1 |
332 |
movq mm3,[src5] ; R7 R5 r7 r5 |
332 |
movq mm3,[src5] ; R7 R5 r7 r5 |
333 |
movq mm4,[coeffs+16] ; C4 C4 C4 C4 |
333 |
movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
334 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
334 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
335 |
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 |
335 |
movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
336 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
336 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
337 |
movq mm5,[coeffs+32] ; C6 C2 C6 C2 |
337 |
movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 |
338 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
338 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
339 |
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 |
339 |
movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 |
340 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
340 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
341 |
; rounder_op mm4, rounder_arg |
341 |
; rounder_op mm4, rounder_arg |
342 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
342 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
343 |
movq mm7,[coeffs+48] ; C3 C1 C3 C1 |
343 |
movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 |
344 |
; rounder_op mm0, rounder_arg |
344 |
; rounder_op mm0, rounder_arg |
345 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
345 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
346 |
paddd mm4,mm5 ; A0 a0 |
346 |
paddd mm4,mm5 ; A0 a0 |
Lines 348-358
coeffs:
Link Here
|
348 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
348 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
349 |
paddd mm0,mm1 ; A1 a1 |
349 |
paddd mm0,mm1 ; A1 a1 |
350 |
psubd mm5,mm1 ; A2 a2 |
350 |
psubd mm5,mm1 ; A2 a2 |
351 |
movq mm1,[coeffs+56] ; C7 C5 C7 C5 |
351 |
movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 |
352 |
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 |
352 |
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 |
353 |
pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1 |
353 |
pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1 |
354 |
paddd mm7,mm1 ; B0 b0 |
354 |
paddd mm7,mm1 ; B0 b0 |
355 |
movq mm1,[coeffs+72] ; -C5 -C1 -C5 -C1 |
355 |
movq mm1,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 |
356 |
pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
356 |
pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
357 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
357 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
358 |
paddd mm4,mm4 ; 2A0 2a0 |
358 |
paddd mm4,mm4 ; 2A0 2a0 |
Lines 374-386
coeffs:
Link Here
|
374 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
374 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
375 |
movd [ dst + 112],mm4 |
375 |
movd [ dst + 112],mm4 |
376 |
movq mm0,[src1] ; R3 R1 r3 r1 |
376 |
movq mm0,[src1] ; R3 R1 r3 r1 |
377 |
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5 |
377 |
movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5 |
378 |
pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1 |
378 |
pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1 |
379 |
movq mm7,[coeffs+88] ; C3 C7 C3 C7 |
379 |
movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 |
380 |
pmaddwd mm0,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 |
380 |
pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 |
381 |
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 |
381 |
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 |
382 |
movq mm2,mm5 ; A2 a2 |
382 |
movq mm2,mm5 ; A2 a2 |
383 |
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 |
383 |
pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 |
384 |
paddd mm4,mm7 ; B2 b2 |
384 |
paddd mm4,mm7 ; B2 b2 |
385 |
paddd mm2,mm4 ; A2+B2 a2+b2 |
385 |
paddd mm2,mm4 ; A2+B2 a2+b2 |
386 |
psubd mm5,mm4 ; a2-B2 a2-b2 |
386 |
psubd mm5,mm4 ; a2-B2 a2-b2 |
Lines 426-438
coeffs:
Link Here
|
426 |
movq mm0,[src0] ; R4 R0 r4 r0 |
426 |
movq mm0,[src0] ; R4 R0 r4 r0 |
427 |
movq mm1,[src4] ; R6 R2 r6 r2 |
427 |
movq mm1,[src4] ; R6 R2 r6 r2 |
428 |
movq mm3,[src5] ; R7 R5 r7 r5 |
428 |
movq mm3,[src5] ; R7 R5 r7 r5 |
429 |
movq mm4,[coeffs+16] ; C4 C4 C4 C4 |
429 |
movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
430 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
430 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
431 |
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 |
431 |
movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
432 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
432 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
433 |
movq mm5,[coeffs+32] ; C6 C2 C6 C2 |
433 |
movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 |
434 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
434 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
435 |
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 |
435 |
movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 |
436 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
436 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
437 |
; rounder_op mm4, rounder_arg |
437 |
; rounder_op mm4, rounder_arg |
438 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
438 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
Lines 442-450
coeffs:
Link Here
|
442 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
442 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
443 |
paddd mm0,mm1 ; A1 a1 |
443 |
paddd mm0,mm1 ; A1 a1 |
444 |
psubd mm5,mm1 ; A2 a2 |
444 |
psubd mm5,mm1 ; A2 a2 |
445 |
movq mm1,[coeffs+56] ; C7 C5 C7 C5 |
445 |
movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 |
446 |
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 |
446 |
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 |
447 |
movq mm7,[coeffs+72] ; -C5 -C1 -C5 -C1 |
447 |
movq mm7,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 |
448 |
pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
448 |
pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
449 |
paddd mm1,mm4 ; A0+B0 a0+b0 |
449 |
paddd mm1,mm4 ; A0+B0 a0+b0 |
450 |
paddd mm4,mm4 ; 2A0 2a0 |
450 |
paddd mm4,mm4 ; 2A0 2a0 |
Lines 464-473
coeffs:
Link Here
|
464 |
movd [ dst + 96 ],mm2 |
464 |
movd [ dst + 96 ],mm2 |
465 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
465 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
466 |
movd [ dst + 112 ],mm4 |
466 |
movd [ dst + 112 ],mm4 |
467 |
movq mm1,[coeffs+88] ; C3 C7 C3 C7 |
467 |
movq mm1,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 |
468 |
pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5 |
468 |
pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5 |
469 |
movq mm2,mm5 ; A2 a2 |
469 |
movq mm2,mm5 ; A2 a2 |
470 |
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 |
470 |
pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 |
471 |
paddd mm2,mm1 ; A2+B2 a2+b2 |
471 |
paddd mm2,mm1 ; A2+B2 a2+b2 |
472 |
psubd mm5,mm1 ; a2-B2 a2-b2 |
472 |
psubd mm5,mm1 ; a2-B2 a2-b2 |
473 |
psrad mm2,shift |
473 |
psrad mm2,shift |
Lines 510-526
coeffs:
Link Here
|
510 |
%define shift %8 |
510 |
%define shift %8 |
511 |
movq mm0,[src0] ; R4 R0 r4 r0 |
511 |
movq mm0,[src0] ; R4 R0 r4 r0 |
512 |
movq mm3,[src5] ; R7 R5 r7 r5 |
512 |
movq mm3,[src5] ; R7 R5 r7 r5 |
513 |
movq mm4,[coeffs+16] ; C4 C4 C4 C4 |
513 |
movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
514 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
514 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
515 |
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 |
515 |
movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
516 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
516 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
517 |
; rounder_op mm4, rounder_arg |
517 |
; rounder_op mm4, rounder_arg |
518 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
518 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
519 |
; rounder_op mm0, rounder_arg |
519 |
; rounder_op mm0, rounder_arg |
520 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
520 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
521 |
movq mm1,[coeffs+56] ; C7 C5 C7 C5 |
521 |
movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 |
522 |
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 |
522 |
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 |
523 |
movq mm7,[coeffs+72] ; -C5 -C1 -C5 -C1 |
523 |
movq mm7,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 |
524 |
pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
524 |
pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
525 |
paddd mm1,mm4 ; A0+B0 a0+b0 |
525 |
paddd mm1,mm4 ; A0+B0 a0+b0 |
526 |
paddd mm4,mm4 ; 2A0 2a0 |
526 |
paddd mm4,mm4 ; 2A0 2a0 |
Lines 540-549
coeffs:
Link Here
|
540 |
movd [ dst + 96 ],mm2 |
540 |
movd [ dst + 96 ],mm2 |
541 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
541 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
542 |
movd [ dst + 112 ],mm4 |
542 |
movd [ dst + 112 ],mm4 |
543 |
movq mm1,[coeffs+88] ; C3 C7 C3 C7 |
543 |
movq mm1,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 |
544 |
pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5 |
544 |
pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5 |
545 |
movq mm2,mm5 ; A2 a2 |
545 |
movq mm2,mm5 ; A2 a2 |
546 |
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 |
546 |
pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 |
547 |
paddd mm2,mm1 ; A2+B2 a2+b2 |
547 |
paddd mm2,mm1 ; A2+B2 a2+b2 |
548 |
psubd mm5,mm1 ; a2-B2 a2-b2 |
548 |
psubd mm5,mm1 ; a2-B2 a2-b2 |
549 |
psrad mm2,shift |
549 |
psrad mm2,shift |
Lines 587-607
coeffs:
Link Here
|
587 |
movq mm0,[src0] ; R4 R0 r4 r0 |
587 |
movq mm0,[src0] ; R4 R0 r4 r0 |
588 |
movq mm2,[src1] ; R3 R1 r3 r1 |
588 |
movq mm2,[src1] ; R3 R1 r3 r1 |
589 |
movq mm3,[src5] ; R7 R5 r7 r5 |
589 |
movq mm3,[src5] ; R7 R5 r7 r5 |
590 |
movq mm4,[coeffs+16] ; C4 C4 C4 C4 |
590 |
movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
591 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
591 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
592 |
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 |
592 |
movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
593 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
593 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
594 |
; rounder_op mm4, rounder_arg |
594 |
; rounder_op mm4, rounder_arg |
595 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
595 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
596 |
movq mm7,[coeffs+48] ; C3 C1 C3 C1 |
596 |
movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 |
597 |
; rounder_op mm0, rounder_arg |
597 |
; rounder_op mm0, rounder_arg |
598 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
598 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
599 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
599 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
600 |
movq mm1,[coeffs+56] ; C7 C5 C7 C5 |
600 |
movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 |
601 |
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 |
601 |
pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 |
602 |
pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1 |
602 |
pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1 |
603 |
paddd mm7,mm1 ; B0 b0 |
603 |
paddd mm7,mm1 ; B0 b0 |
604 |
movq mm1,[coeffs+72] ; -C5 -C1 -C5 -C1 |
604 |
movq mm1,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 |
605 |
pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
605 |
pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5 |
606 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
606 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
607 |
paddd mm4,mm4 ; 2A0 2a0 |
607 |
paddd mm4,mm4 ; 2A0 2a0 |
Lines 623-635
coeffs:
Link Here
|
623 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
623 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
624 |
movd [dst + 112],mm4 |
624 |
movd [dst + 112],mm4 |
625 |
movq mm0,[src1] ; R3 R1 r3 r1 |
625 |
movq mm0,[src1] ; R3 R1 r3 r1 |
626 |
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5 |
626 |
movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5 |
627 |
pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1 |
627 |
pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1 |
628 |
movq mm7,[coeffs+88] ; C3 C7 C3 C7 |
628 |
movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 |
629 |
pmaddwd mm0,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 |
629 |
pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 |
630 |
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 |
630 |
pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 |
631 |
movq mm2,mm5 ; A2 a2 |
631 |
movq mm2,mm5 ; A2 a2 |
632 |
pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 |
632 |
pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 |
633 |
paddd mm4,mm7 ; B2 b2 |
633 |
paddd mm4,mm7 ; B2 b2 |
634 |
paddd mm2,mm4 ; A2+B2 a2+b2 |
634 |
paddd mm2,mm4 ; A2+B2 a2+b2 |
635 |
psubd mm5,mm4 ; a2-B2 a2-b2 |
635 |
psubd mm5,mm4 ; a2-B2 a2-b2 |
Lines 674-690
coeffs:
Link Here
|
674 |
%define shift %8 |
674 |
%define shift %8 |
675 |
movq mm0,[src0] ; R4 R0 r4 r0 |
675 |
movq mm0,[src0] ; R4 R0 r4 r0 |
676 |
movq mm2,[src1] ; R3 R1 r3 r1 |
676 |
movq mm2,[src1] ; R3 R1 r3 r1 |
677 |
movq mm4,[coeffs+16] ; C4 C4 C4 C4 |
677 |
movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
678 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
678 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
679 |
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 |
679 |
movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
680 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
680 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
681 |
; rounder_op mm4, rounder_arg |
681 |
; rounder_op mm4, rounder_arg |
682 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
682 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
683 |
movq mm7,[coeffs+48] ; C3 C1 C3 C1 |
683 |
movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 |
684 |
; rounder_op mm0, rounder_arg |
684 |
; rounder_op mm0, rounder_arg |
685 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
685 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
686 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
686 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
687 |
movq mm3,[coeffs+64] |
687 |
movq mm3,[ebx + coeffs+64 wrt ..gotoff] |
688 |
pmaddwd mm3,mm2 ; -C7R3+C3R1 -C7r3+C3r1 |
688 |
pmaddwd mm3,mm2 ; -C7R3+C3R1 -C7r3+C3r1 |
689 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
689 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
690 |
paddd mm4,mm4 ; 2A0 2a0 |
690 |
paddd mm4,mm4 ; 2A0 2a0 |
Lines 704-712
coeffs:
Link Here
|
704 |
movd [dst + 96],mm1 |
704 |
movd [dst + 96],mm1 |
705 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
705 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
706 |
movd [dst + 112],mm4 |
706 |
movd [dst + 112],mm4 |
707 |
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5 |
707 |
movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5 |
708 |
pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1 |
708 |
pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1 |
709 |
pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 |
709 |
pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 |
710 |
movq mm1,mm5 ; A2 a2 |
710 |
movq mm1,mm5 ; A2 a2 |
711 |
paddd mm1,mm4 ; A2+B2 a2+b2 |
711 |
paddd mm1,mm4 ; A2+B2 a2+b2 |
712 |
psubd mm5,mm4 ; a2-B2 a2-b2 |
712 |
psubd mm5,mm4 ; a2-B2 a2-b2 |
Lines 750-762
coeffs:
Link Here
|
750 |
%define shift %8 |
750 |
%define shift %8 |
751 |
movq mm0,[src0] ; R4 R0 r4 r0 |
751 |
movq mm0,[src0] ; R4 R0 r4 r0 |
752 |
movq mm1,[src4] ; R6 R2 r6 r2 |
752 |
movq mm1,[src4] ; R6 R2 r6 r2 |
753 |
movq mm4,[coeffs+16] ; C4 C4 C4 C4 |
753 |
movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
754 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
754 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
755 |
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 |
755 |
movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
756 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
756 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
757 |
movq mm5,[coeffs+32] ; C6 C2 C6 C2 |
757 |
movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 |
758 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
758 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
759 |
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 |
759 |
movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 |
760 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
760 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
761 |
; rounder_op mm4, rounder_arg |
761 |
; rounder_op mm4, rounder_arg |
762 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
762 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
Lines 768-780
coeffs:
Link Here
|
768 |
psubd mm5,mm1 ; A2 a2 |
768 |
psubd mm5,mm1 ; A2 a2 |
769 |
movq mm2,[src0 + 8] ; R4 R0 r4 r0 |
769 |
movq mm2,[src0 + 8] ; R4 R0 r4 r0 |
770 |
movq mm3,[src4 + 8] ; R6 R2 r6 r2 |
770 |
movq mm3,[src4 + 8] ; R6 R2 r6 r2 |
771 |
movq mm1,[coeffs+16] ; C4 C4 C4 C4 |
771 |
movq mm1,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
772 |
pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0 |
772 |
pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0 |
773 |
movq mm7,[coeffs+24] ; -C4 C4 -C4 C4 |
773 |
movq mm7,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
774 |
pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0 |
774 |
pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0 |
775 |
movq mm7,[coeffs+32] ; C6 C2 C6 C2 |
775 |
movq mm7,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 |
776 |
pmaddwd mm7,mm3 ; C6R6+C2R2 C6r6+C2r2 |
776 |
pmaddwd mm7,mm3 ; C6R6+C2R2 C6r6+C2r2 |
777 |
pmaddwd mm3,[coeffs+40] ; -C2R6+C6R2 -C2r6+C6r2 |
777 |
pmaddwd mm3,[ebx + coeffs+40 wrt ..gotoff] ; -C2R6+C6R2 -C2r6+C6r2 |
778 |
; rounder_op mm1, rounder_arg |
778 |
; rounder_op mm1, rounder_arg |
779 |
paddd mm7,mm1 ; A0 a0 |
779 |
paddd mm7,mm1 ; A0 a0 |
780 |
paddd mm1,mm1 ; 2C0 2c0 |
780 |
paddd mm1,mm1 ; 2C0 2c0 |
Lines 829-845
coeffs:
Link Here
|
829 |
movq mm0,[src0] ; R4 R0 r4 r0 |
829 |
movq mm0,[src0] ; R4 R0 r4 r0 |
830 |
movq mm1,[src4] ; R6 R2 r6 r2 |
830 |
movq mm1,[src4] ; R6 R2 r6 r2 |
831 |
movq mm2,[src1] ; R3 R1 r3 r1 |
831 |
movq mm2,[src1] ; R3 R1 r3 r1 |
832 |
movq mm4,[coeffs+16] ; C4 C4 C4 C4 |
832 |
movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
833 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
833 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
834 |
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 |
834 |
movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
835 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
835 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
836 |
movq mm5,[coeffs+32] ; C6 C2 C6 C2 |
836 |
movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 |
837 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
837 |
pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 |
838 |
movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 |
838 |
movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 |
839 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
839 |
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 |
840 |
; rounder_op mm4, rounder_arg |
840 |
; rounder_op mm4, rounder_arg |
841 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
841 |
movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 |
842 |
movq mm7,[coeffs+48] ; C3 C1 C3 C1 |
842 |
movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 |
843 |
; rounder_op mm0, rounder_arg |
843 |
; rounder_op mm0, rounder_arg |
844 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
844 |
pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 |
845 |
paddd mm4,mm5 ; A0 a0 |
845 |
paddd mm4,mm5 ; A0 a0 |
Lines 847-853
coeffs:
Link Here
|
847 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
847 |
movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 |
848 |
paddd mm0,mm1 ; A1 a1 |
848 |
paddd mm0,mm1 ; A1 a1 |
849 |
psubd mm5,mm1 ; A2 a2 |
849 |
psubd mm5,mm1 ; A2 a2 |
850 |
movq mm1,[coeffs+64] |
850 |
movq mm1,[ebx + coeffs+64 wrt ..gotoff] |
851 |
pmaddwd mm1,mm2 ; -C7R3+C3R1 -C7r3+C3r1 |
851 |
pmaddwd mm1,mm2 ; -C7R3+C3R1 -C7r3+C3r1 |
852 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
852 |
paddd mm7,mm4 ; A0+B0 a0+b0 |
853 |
paddd mm4,mm4 ; 2A0 2a0 |
853 |
paddd mm4,mm4 ; 2A0 2a0 |
Lines 867-875
coeffs:
Link Here
|
867 |
movd [dst + 96],mm3 |
867 |
movd [dst + 96],mm3 |
868 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
868 |
packssdw mm4,mm4 ; A0-B0 a0-b0 |
869 |
movd [dst + 112],mm4 |
869 |
movd [dst + 112],mm4 |
870 |
movq mm4,[coeffs+80] ; -C1 C5 -C1 C5 |
870 |
movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5 |
871 |
pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1 |
871 |
pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1 |
872 |
pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 |
872 |
pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 |
873 |
movq mm3,mm5 ; A2 a2 |
873 |
movq mm3,mm5 ; A2 a2 |
874 |
paddd mm3,mm4 ; A2+B2 a2+b2 |
874 |
paddd mm3,mm4 ; A2+B2 a2+b2 |
875 |
psubd mm5,mm4 ; a2-B2 a2-b2 |
875 |
psubd mm5,mm4 ; a2-B2 a2-b2 |
Lines 912-931
coeffs:
Link Here
|
912 |
%define rounder_arg %7 |
912 |
%define rounder_arg %7 |
913 |
%define shift %8 |
913 |
%define shift %8 |
914 |
movq mm0,[src0] ; R4 R0 r4 r0 |
914 |
movq mm0,[src0] ; R4 R0 r4 r0 |
915 |
movq mm4,[coeffs+16] ; C4 C4 C4 C4 |
915 |
movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
916 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
916 |
pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 |
917 |
movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 |
917 |
movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
918 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
918 |
pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 |
919 |
; rounder_op mm4, rounder_arg |
919 |
; rounder_op mm4, rounder_arg |
920 |
; rounder_op mm0, rounder_arg |
920 |
; rounder_op mm0, rounder_arg |
921 |
psrad mm4,shift |
921 |
psrad mm4,shift |
922 |
psrad mm0,shift |
922 |
psrad mm0,shift |
923 |
movq mm2,[src0 + 8] ; R4 R0 r4 r0 |
923 |
movq mm2,[src0 + 8] ; R4 R0 r4 r0 |
924 |
movq mm1,[coeffs+16] ; C4 C4 C4 C4 |
924 |
movq mm1,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 |
925 |
pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0 |
925 |
pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0 |
926 |
movq mm7,[coeffs+24] ; -C4 C4 -C4 C4 |
926 |
movq mm7,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 |
927 |
pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0 |
927 |
pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0 |
928 |
movq mm7,[coeffs+32] ; C6 C2 C6 C2 |
928 |
movq mm7,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 |
929 |
; rounder_op mm1, rounder_arg |
929 |
; rounder_op mm1, rounder_arg |
930 |
; rounder_op mm2, rounder_arg |
930 |
; rounder_op mm2, rounder_arg |
931 |
psrad mm1,shift |
931 |
psrad mm1,shift |
Lines 1073-1078
coeffs:
Link Here
|
1073 |
|
1073 |
|
1074 |
SECTION .text |
1074 |
SECTION .text |
1075 |
|
1075 |
|
|
|
1076 |
extern _GLOBAL_OFFSET_TABLE_ |
1077 |
; get_pc.bx -- helper used to obtain the current code address for
; position-independent addressing.
; Copies the dword at the top of the stack into ebx and returns. When it is
; entered through `call get_pc.bx`, that dword is the return address, so the
; caller resumes with ebx = address of the instruction following its call.
; The callers visible here then execute
;   add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
; and reference constants as [ebx + symbol wrt ..gotoff].
; Overwrites ebx; the callers shown here `push ebx` before calling.
get_pc.bx: |
1078 |
mov ebx, [esp] |
1079 |
retn |
1080 |
|
1076 |
cglobal simple_idct_mmx_P |
1081 |
cglobal simple_idct_mmx_P |
1077 |
cglobal simple_idct_mmx |
1082 |
cglobal simple_idct_mmx |
1078 |
|
1083 |
|
Lines 1083-1096
cglobal simple_idct_mmx
Link Here
|
1083 |
|
1088 |
|
1084 |
ALIGN 16 |
1089 |
ALIGN 16 |
1085 |
simple_idct_mmx_P: |
1090 |
simple_idct_mmx_P: |
|
|
1091 |
push ebx |
1092 |
call get_pc.bx |
1093 |
add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc |
1094 |
|
1086 |
sub esp, 128 |
1095 |
sub esp, 128 |
1087 |
mov edx, [esp+128+4] |
1096 |
mov edx, [esp+128+4+4] |
1088 |
|
1097 |
|
1089 |
; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt |
1098 |
; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt |
1090 |
DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11 |
1099 |
DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [ebx + coeffs+8 wrt ..gotoff], 11 |
1091 |
Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .four |
1100 |
Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [ebx + coeffs wrt ..gotoff], 11, .four |
1092 |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .two |
1101 |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .two |
1093 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .one |
1102 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .one |
1094 |
IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1103 |
IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1095 |
IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1104 |
IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1096 |
IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1105 |
IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
Lines 1099-1106
simple_idct_mmx_P:
Link Here
|
1099 |
|
1108 |
|
1100 |
ALIGN 16 |
1109 |
ALIGN 16 |
1101 |
.four |
1110 |
.four |
1102 |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .six |
1111 |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .six |
1103 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .five |
1112 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .five |
1104 |
IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1113 |
IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1105 |
IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1114 |
IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1106 |
IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1115 |
IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
Lines 1109-1115
ALIGN 16
Link Here
|
1109 |
|
1118 |
|
1110 |
ALIGN 16 |
1119 |
ALIGN 16 |
1111 |
.six |
1120 |
.six |
1112 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .seven |
1121 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .seven |
1113 |
IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1122 |
IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1114 |
IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1123 |
IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1115 |
IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1124 |
IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
Lines 1118-1124
ALIGN 16
Link Here
|
1118 |
|
1127 |
|
1119 |
ALIGN 16 |
1128 |
ALIGN 16 |
1120 |
.two |
1129 |
.two |
1121 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .three |
1130 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .three |
1122 |
IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1131 |
IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1123 |
IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1132 |
IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1124 |
IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1133 |
IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
Lines 1159-1164
ALIGN 16
Link Here
|
1159 |
.ret |
1168 |
.ret |
1160 |
add esp, 128 |
1169 |
add esp, 128 |
1161 |
|
1170 |
|
|
|
1171 |
pop ebx |
1162 |
ret |
1172 |
ret |
1163 |
.endfunc |
1173 |
.endfunc |
1164 |
|
1174 |
|
Lines 1174-1188
ALIGN 16
Link Here
|
1174 |
|
1184 |
|
1175 |
ALIGN 16 |
1185 |
ALIGN 16 |
1176 |
simple_idct_mmx: |
1186 |
simple_idct_mmx: |
|
|
1187 |
push ebx |
1188 |
call get_pc.bx |
1189 |
add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc |
1190 |
|
1177 |
sub esp, 128 |
1191 |
sub esp, 128 |
1178 |
mov edx, [esp+128+4] |
1192 |
mov edx, [esp+128+4+4] |
1179 |
PERMUTEP edx ; permute parm list in place |
1193 |
PERMUTEP edx ; permute parm list in place |
1180 |
|
1194 |
|
1181 |
; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt |
1195 |
; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt |
1182 |
DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11 |
1196 |
DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [ebx + coeffs+8 wrt ..gotoff], 11 |
1183 |
Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .fourP |
1197 |
Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [ebx + coeffs wrt ..gotoff], 11, .fourP |
1184 |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .twoP |
1198 |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .twoP |
1185 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .oneP |
1199 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .oneP |
1186 |
IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1200 |
IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1187 |
IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1201 |
IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1188 |
IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1202 |
IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
Lines 1191-1198
simple_idct_mmx:
Link Here
|
1191 |
|
1205 |
|
1192 |
ALIGN 16 |
1206 |
ALIGN 16 |
1193 |
.fourP |
1207 |
.fourP |
1194 |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .sixP |
1208 |
Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .sixP |
1195 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .fiveP |
1209 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .fiveP |
1196 |
IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1210 |
IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1197 |
IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1211 |
IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1198 |
IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1212 |
IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
Lines 1201-1207
ALIGN 16
Link Here
|
1201 |
|
1215 |
|
1202 |
ALIGN 16 |
1216 |
ALIGN 16 |
1203 |
.sixP |
1217 |
.sixP |
1204 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .sevenP |
1218 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .sevenP |
1205 |
IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1219 |
IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1206 |
IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1220 |
IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1207 |
IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1221 |
IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
Lines 1210-1216
ALIGN 16
Link Here
|
1210 |
|
1224 |
|
1211 |
ALIGN 16 |
1225 |
ALIGN 16 |
1212 |
.twoP |
1226 |
.twoP |
1213 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .threeP |
1227 |
Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .threeP |
1214 |
IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1228 |
IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 |
1215 |
IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1229 |
IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 |
1216 |
IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
1230 |
IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 |
Lines 1251-1256
ALIGN 16
Link Here
|
1251 |
.retP |
1265 |
.retP |
1252 |
add esp, 128 |
1266 |
add esp, 128 |
1253 |
|
1267 |
|
|
|
1268 |
pop ebx |
1254 |
ret |
1269 |
ret |
1255 |
.endfunc |
1270 |
.endfunc |
1256 |
|
1271 |
|