Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 202465 | Differences between
and this patch

Collapse All | Expand All

(-)xvidcore-1.1.3-old/src/bitstream/x86_asm/cbp_mmx.asm (-19 / +11 lines)
Lines 50-72 BITS 32 Link Here
50
%endmacro
50
%endmacro
51
51
52
;=============================================================================
52
;=============================================================================
53
; Local data
54
;=============================================================================
55
56
%ifdef FORMAT_COFF
57
SECTION .rodata
58
%else
59
SECTION .rodata align=16
60
%endif
61
62
ALIGN 16
63
64
mult_mask:
65
  db 0x10,0x20,0x04,0x08,0x01,0x02,0x00,0x00
66
ignore_dc:
67
  dw 0, -1, -1, -1
68
69
;=============================================================================
70
; Code
53
; Code
71
;=============================================================================
54
;=============================================================================
72
55
Lines 91-97 ALIGN 16 Link Here
91
calc_cbp_mmx:
74
calc_cbp_mmx:
92
  mov eax, [esp + 4]            ; coeff
75
  mov eax, [esp + 4]            ; coeff
93
76
94
  movq mm7, [ignore_dc]
77
  push byte 0                 ; align esp to 8 bytes
78
  push byte -1
79
  push dword 0xFFFF0000
80
  movq mm7, [esp]
81
  add esp, byte 8
82
95
  pxor mm6, mm6                ; used only for comparing
83
  pxor mm6, mm6                ; used only for comparing
96
  movq mm0, [eax+128*0]
84
  movq mm0, [eax+128*0]
97
  movq mm1, [eax+128*1]
85
  movq mm1, [eax+128*1]
Lines 123-129 calc_cbp_mmx: Link Here
123
  MAKE_LOAD 13
111
  MAKE_LOAD 13
124
  MAKE_LOAD 14
112
  MAKE_LOAD 14
125
113
126
  movq mm7, [mult_mask]
114
  push dword 0x00000201
115
  push dword 0x08042010
116
  movq mm7, [esp]
117
  add esp, byte 12
118
127
  packssdw mm0, mm1
119
  packssdw mm0, mm1
128
  packssdw mm2, mm3
120
  packssdw mm2, mm3
129
  packssdw mm4, mm5
121
  packssdw mm4, mm5
(-)xvidcore-1.1.3-old/src/bitstream/x86_asm/cbp_sse2.asm (-15 / +7 lines)
Lines 69-88 BITS 32 Link Here
69
%endmacro
69
%endmacro
70
70
71
;=============================================================================
71
;=============================================================================
72
; Data (Read Only)
73
;=============================================================================
74
75
%ifdef FORMAT_COFF
76
SECTION .rodata
77
%else
78
SECTION .rodata align=16
79
%endif
80
81
ALIGN 16
82
ignore_dc:
83
  dw 0, -1, -1, -1, -1, -1, -1, -1
84
85
;=============================================================================
86
; Code
72
; Code
87
;=============================================================================
73
;=============================================================================
88
74
Lines 98-104 calc_cbp_sse2: Link Here
98
  mov edx, [esp+4]         ; coeff[]
84
  mov edx, [esp+4]         ; coeff[]
99
  xor eax, eax             ; cbp = 0
85
  xor eax, eax             ; cbp = 0
100
86
101
  movdqu xmm7, [ignore_dc] ; mask to ignore dc value
87
  sub esp,byte 12          ; align esp to 16 bytes
88
  push byte -1
89
  push byte -1
90
  push byte -1
91
  push dword 0xFFFF0000
92
  movdqu xmm7, [esp]       ; mask to ignore dc value
93
  add esp, byte 28
102
  pxor xmm6, xmm6          ; zero
94
  pxor xmm6, xmm6          ; zero
103
95
104
  LOOP_SSE2 0
96
  LOOP_SSE2 0
(-)xvidcore-1.1.3-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm (-35 / +45 lines)
Lines 204-210 fdct_r_row: Link Here
204
  psllw mm4, SHIFT_FRW_COL
204
  psllw mm4, SHIFT_FRW_COL
205
  movq mm6, mm0
205
  movq mm6, mm0
206
  psubsw mm2, mm1
206
  psubsw mm2, mm1
207
  movq mm1, [fdct_tg_all_16 + 4*2]
207
  movq mm1, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff]
208
  psubsw mm0, mm4
208
  psubsw mm0, mm4
209
  movq mm7, [%2 + %3*2 + 3*16]
209
  movq mm7, [%2 + %3*2 + 3*16]
210
  pmulhw mm1, mm0
210
  pmulhw mm1, mm0
Lines 216-224 fdct_r_row: Link Here
216
  psubsw mm5, mm7
216
  psubsw mm5, mm7
217
  paddsw mm1, mm5
217
  paddsw mm1, mm5
218
  paddsw mm4, mm7
218
  paddsw mm4, mm7
219
  por mm1, [fdct_one_corr]
219
  por mm1, [ebx + fdct_one_corr wrt ..gotoff]
220
  psllw mm2, SHIFT_FRW_COL + 1
220
  psllw mm2, SHIFT_FRW_COL + 1
221
  pmulhw mm5, [fdct_tg_all_16 + 4*2]
221
  pmulhw mm5, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff]
222
  movq mm7, mm4
222
  movq mm7, mm4
223
  psubsw mm3, [%2 + %3*2 + 5*16]
223
  psubsw mm3, [%2 + %3*2 + 5*16]
224
  psubsw mm4, mm6
224
  psubsw mm4, mm6
Lines 230-263 fdct_r_row: Link Here
230
  movq mm6, mm2
230
  movq mm6, mm2
231
  movq [%1 + %3*2 + 4*16], mm4
231
  movq [%1 + %3*2 + 4*16], mm4
232
  paddsw mm2, mm3
232
  paddsw mm2, mm3
233
  pmulhw mm2, [ocos_4_16]
233
  pmulhw mm2, [ebx + ocos_4_16 wrt ..gotoff]
234
  psubsw mm6, mm3
234
  psubsw mm6, mm3
235
  pmulhw mm6, [ocos_4_16]
235
  pmulhw mm6, [ebx + ocos_4_16 wrt ..gotoff]
236
  psubsw mm5, mm0
236
  psubsw mm5, mm0
237
  por mm5, [fdct_one_corr]
237
  por mm5, [ebx + fdct_one_corr wrt ..gotoff]
238
  psllw mm1, SHIFT_FRW_COL
238
  psllw mm1, SHIFT_FRW_COL
239
  por mm2, [fdct_one_corr]
239
  por mm2, [ebx + fdct_one_corr wrt ..gotoff]
240
  movq mm4, mm1
240
  movq mm4, mm1
241
  movq mm3, [%2 + %3*2 + 0*16]
241
  movq mm3, [%2 + %3*2 + 0*16]
242
  paddsw mm1, mm6
242
  paddsw mm1, mm6
243
  psubsw mm3, [%2 + %3*2 + 7*16]
243
  psubsw mm3, [%2 + %3*2 + 7*16]
244
  psubsw mm4, mm6
244
  psubsw mm4, mm6
245
  movq mm0, [fdct_tg_all_16 + 0*2]
245
  movq mm0, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff]
246
  psllw mm3, SHIFT_FRW_COL
246
  psllw mm3, SHIFT_FRW_COL
247
  movq mm6, [fdct_tg_all_16 + 8*2]
247
  movq mm6, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff]
248
  pmulhw mm0, mm1
248
  pmulhw mm0, mm1
249
  movq [%1 + %3*2 + 0*16], mm7
249
  movq [%1 + %3*2 + 0*16], mm7
250
  pmulhw mm6, mm4
250
  pmulhw mm6, mm4
251
  movq [%1 + %3*2 + 6*16], mm5
251
  movq [%1 + %3*2 + 6*16], mm5
252
  movq mm7, mm3
252
  movq mm7, mm3
253
  movq mm5, [fdct_tg_all_16 + 8*2]
253
  movq mm5, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff]
254
  psubsw mm7, mm2
254
  psubsw mm7, mm2
255
  paddsw mm3, mm2
255
  paddsw mm3, mm2
256
  pmulhw mm5, mm7
256
  pmulhw mm5, mm7
257
  paddsw mm0, mm3
257
  paddsw mm0, mm3
258
  paddsw mm6, mm4
258
  paddsw mm6, mm4
259
  pmulhw mm3, [fdct_tg_all_16 + 0*2]
259
  pmulhw mm3, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff]
260
  por mm0, [fdct_one_corr]
260
  por mm0, [ebx + fdct_one_corr wrt ..gotoff]
261
  paddsw mm5, mm7
261
  paddsw mm5, mm7
262
  psubsw mm7, mm6
262
  psubsw mm7, mm6
263
  movq [%1 + %3*2 + 1*16], mm0
263
  movq [%1 + %3*2 + 1*16], mm0
Lines 287-314 fdct_r_row: Link Here
287
  movq mm6, mm5
287
  movq mm6, mm5
288
  punpckldq mm3, mm5
288
  punpckldq mm3, mm5
289
  punpckhdq mm6, mm3
289
  punpckhdq mm6, mm3
290
  movq mm3, [%3 + 0*2]
290
  movq mm3, [0*2 + %3]
291
  movq mm4, [%3 + 4*2]
291
  movq mm4, [4*2 + %3]
292
  punpckldq mm2, mm0
292
  punpckldq mm2, mm0
293
  pmaddwd mm3, mm0
293
  pmaddwd mm3, mm0
294
  punpckhdq mm1, mm2
294
  punpckhdq mm1, mm2
295
  movq mm2, [%3 + 16*2]
295
  movq mm2, [16*2 + %3]
296
  pmaddwd mm4, mm1
296
  pmaddwd mm4, mm1
297
  pmaddwd mm0, [%3 + 8*2]
297
  pmaddwd mm0, [8*2 + %3]
298
  movq mm7, [%3 + 20*2]
298
  movq mm7, [20*2 + %3]
299
  pmaddwd mm2, mm5
299
  pmaddwd mm2, mm5
300
  paddd mm3, [fdct_r_row]
300
  paddd mm3, [ebx + fdct_r_row wrt ..gotoff]
301
  pmaddwd mm7, mm6
301
  pmaddwd mm7, mm6
302
  pmaddwd mm1, [%3 + 12*2]
302
  pmaddwd mm1, [12*2 + %3]
303
  paddd mm3, mm4
303
  paddd mm3, mm4
304
  pmaddwd mm5, [%3 + 24*2]
304
  pmaddwd mm5, [24*2 + %3]
305
  pmaddwd mm6, [%3 + 28*2]
305
  pmaddwd mm6, [28*2 + %3]
306
  paddd mm2, mm7
306
  paddd mm2, mm7
307
  paddd mm0, [fdct_r_row]
307
  paddd mm0, [ebx + fdct_r_row wrt ..gotoff]
308
  psrad mm3, SHIFT_FRW_ROW
308
  psrad mm3, SHIFT_FRW_ROW
309
  paddd mm2, [fdct_r_row]
309
  paddd mm2, [ebx + fdct_r_row wrt ..gotoff]
310
  paddd mm0, mm1
310
  paddd mm0, mm1
311
  paddd mm5, [fdct_r_row]
311
  paddd mm5, [ebx + fdct_r_row wrt ..gotoff]
312
  psrad mm2, SHIFT_FRW_ROW
312
  psrad mm2, SHIFT_FRW_ROW
313
  paddd mm5, mm6
313
  paddd mm5, mm6
314
  psrad mm0, SHIFT_FRW_ROW
314
  psrad mm0, SHIFT_FRW_ROW
Lines 336-358 fdct_r_row: Link Here
336
  psubsw mm1, mm5
336
  psubsw mm1, mm5
337
  pshufw mm2, mm0, 0x4E
337
  pshufw mm2, mm0, 0x4E
338
  pshufw mm3, mm1, 0x4E
338
  pshufw mm3, mm1, 0x4E
339
  movq mm4, [%3 +  0*2]
339
  movq mm4, [ 0*2 + %3]
340
  movq mm6, [%3 +  4*2]
340
  movq mm6, [ 4*2 + %3]
341
  movq mm5, [%3 + 16*2]
341
  movq mm5, [16*2 + %3]
342
  movq mm7, [%3 + 20*2]
342
  movq mm7, [20*2 + %3]
343
  pmaddwd mm4, mm0
343
  pmaddwd mm4, mm0
344
  pmaddwd mm5, mm1
344
  pmaddwd mm5, mm1
345
  pmaddwd mm6, mm2
345
  pmaddwd mm6, mm2
346
  pmaddwd mm7, mm3
346
  pmaddwd mm7, mm3
347
  pmaddwd mm0, [%3 +  8*2]
347
  pmaddwd mm0, [ 8*2 + %3]
348
  pmaddwd mm2, [%3 + 12*2]
348
  pmaddwd mm2, [12*2 + %3]
349
  pmaddwd mm1, [%3 + 24*2]
349
  pmaddwd mm1, [24*2 + %3]
350
  pmaddwd mm3, [%3 + 28*2]
350
  pmaddwd mm3, [28*2 + %3]
351
  paddd mm4, mm6
351
  paddd mm4, mm6
352
  paddd mm5, mm7
352
  paddd mm5, mm7
353
  paddd mm0, mm2
353
  paddd mm0, mm2
354
  paddd mm1, mm3
354
  paddd mm1, mm3
355
  movq mm7, [fdct_r_row]
355
  movq mm7, [ebx + fdct_r_row wrt ..gotoff]
356
  paddd mm4, mm7
356
  paddd mm4, mm7
357
  paddd mm5, mm7
357
  paddd mm5, mm7
358
  paddd mm0, mm7
358
  paddd mm0, mm7
Lines 377-382 cglobal %1 Link Here
377
	;; Move the destination/source address to the eax register
377
	;; Move the destination/source address to the eax register
378
  mov eax, [esp + 4]
378
  mov eax, [esp + 4]
379
379
380
  push ebx
381
  call get_pc.bx
382
  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
383
380
	;; Process the columns (4 at a time)
384
	;; Process the columns (4 at a time)
381
  FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
385
  FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3
382
  FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
386
  FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7
Lines 386-397 cglobal %1 Link Here
386
%assign i 0
390
%assign i 0
387
%rep 8
391
%rep 8
388
	;; Process the 'i'th row
392
	;; Process the 'i'th row
389
  %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i
393
  %2 eax+2*i*8, eax+2*i*8, ebx + tab_frw_01234567+2*32*i wrt ..gotoff
390
	%assign i i+1
394
	%assign i i+1
391
%endrep
395
%endrep
392
%else
396
%else
393
  mov ecx, 8
397
  mov ecx, 8
394
  mov edx, tab_frw_01234567
398
  lea edx, [ebx + tab_frw_01234567 wrt ..gotoff] ; edx = ADDRESS of the row table (used as the %3 base pointer below); a mov here would load the table's first dword instead
395
ALIGN 8
399
ALIGN 8
396
.loop
400
.loop
397
  %2 eax, eax, edx
401
  %2 eax, eax, edx
Lines 401-406 ALIGN 8 Link Here
401
  jne .loop
405
  jne .loop
402
%endif
406
%endif
403
407
408
  pop ebx
404
  ret
409
  ret
405
.endfunc
410
.endfunc
406
%endmacro
411
%endmacro
Lines 411-416 ALIGN 8 Link Here
411
416
412
SECTION .text
417
SECTION .text
413
418
419
extern  _GLOBAL_OFFSET_TABLE_
420
get_pc.bx:                      ; PIC helper: hands the caller its own return address in ebx
421
  mov ebx, [esp]                ; on entry [esp] is the return address pushed by `call get_pc.bx`
422
  retn                          ; near return; esp is left untouched, only ebx is modified
423
414
;-----------------------------------------------------------------------------
424
;-----------------------------------------------------------------------------
415
; void fdct_mmx_ffmpeg(int16_t block[64]);
425
; void fdct_mmx_ffmpeg(int16_t block[64]);
416
;-----------------------------------------------------------------------------
426
;-----------------------------------------------------------------------------
(-)xvidcore-1.1.3-old/src/dct/x86_asm/fdct_mmx_skal.asm (-32 / +42 lines)
Lines 294-308 MMX_One: Link Here
294
  paddsw mm2, mm1       ; mm2: t6+t5
294
  paddsw mm2, mm1       ; mm2: t6+t5
295
  movq [%1+0*16], mm5   ; => out0
295
  movq [%1+0*16], mm5   ; => out0
296
296
297
  movq mm4, [tan2]      ; mm4 <= tan2
297
  movq mm4, [ebx + tan2 wrt ..gotoff]      ; mm4 <= tan2
298
  pmulhw mm4, mm7       ; tm03*tan2
298
  pmulhw mm4, mm7       ; tm03*tan2
299
  movq mm5, [tan2]      ; mm5 <= tan2
299
  movq mm5, [ebx + tan2 wrt ..gotoff]      ; mm5 <= tan2
300
  psubsw mm4, mm6       ; out6 = tm03*tan2 - tm12
300
  psubsw mm4, mm6       ; out6 = tm03*tan2 - tm12
301
  pmulhw mm5, mm6       ; tm12*tan2
301
  pmulhw mm5, mm6       ; tm12*tan2
302
  paddsw mm5, mm7       ; out2 = tm12*tan2 + tm03
302
  paddsw mm5, mm7       ; out2 = tm12*tan2 + tm03
303
303
304
  movq mm6, [sqrt2]
304
  movq mm6, [ebx + sqrt2 wrt ..gotoff]
305
  movq mm7, [MMX_One]
305
  movq mm7, [ebx + MMX_One wrt ..gotoff]
306
306
307
  pmulhw mm2, mm6       ; mm2: tp65 = (t6 + t5)*cos4
307
  pmulhw mm2, mm6       ; mm2: tp65 = (t6 + t5)*cos4
308
  por mm5, mm7          ; correct out2
308
  por mm5, mm7          ; correct out2
Lines 320-327 MMX_One: Link Here
320
  paddsw mm2, mm4       ; mm2: tp765 = t7 + tp65
320
  paddsw mm2, mm4       ; mm2: tp765 = t7 + tp65
321
  paddsw mm1, mm5       ; mm1: tp465 = t4 + tm65
321
  paddsw mm1, mm5       ; mm1: tp465 = t4 + tm65
322
322
323
  movq mm4, [tan3]      ; tan3 - 1
323
  movq mm4, [ebx + tan3 wrt ..gotoff]      ; tan3 - 1
324
  movq mm5, [tan1]      ; tan1
324
  movq mm5, [ebx + tan1 wrt ..gotoff]      ; tan1
325
325
326
  movq mm7, mm3         ; save tm465
326
  movq mm7, mm3         ; save tm465
327
  pmulhw mm3, mm4       ; tm465*(tan3-1)
327
  pmulhw mm3, mm4       ; tm465*(tan3-1)
Lines 364-386 MMX_One: Link Here
364
  punpckldq mm0, mm7           ; mm0 = [a0 a1 b0 b1]
364
  punpckldq mm0, mm7           ; mm0 = [a0 a1 b0 b1]
365
  punpckhdq mm1, mm7           ; mm1 = [b2 b3 a2 a3]
365
  punpckhdq mm1, mm7           ; mm1 = [b2 b3 a2 a3]
366
366
367
  movq mm2, qword [%3 + 0]     ;  [   M00    M01      M16    M17]
367
  movq mm2, qword [0 + %3]     ;  [   M00    M01      M16    M17]
368
  movq mm3, qword [%3 + 8]     ;  [   M02    M03      M18    M19]
368
  movq mm3, qword [8 + %3]     ;  [   M02    M03      M18    M19]
369
  pmaddwd mm2, mm0             ;  [a0.M00+a1.M01 | b0.M16+b1.M17]
369
  pmaddwd mm2, mm0             ;  [a0.M00+a1.M01 | b0.M16+b1.M17]
370
  movq mm4, qword [%3 + 16]    ;  [   M04    M05      M20    M21]
370
  movq mm4, qword [16 + %3]    ;  [   M04    M05      M20    M21]
371
  pmaddwd mm3, mm1             ;  [a2.M02+a3.M03 | b2.M18+b3.M19]
371
  pmaddwd mm3, mm1             ;  [a2.M02+a3.M03 | b2.M18+b3.M19]
372
  movq mm5, qword [%3 + 24]    ;  [   M06    M07      M22    M23]
372
  movq mm5, qword [24 + %3]    ;  [   M06    M07      M22    M23]
373
  pmaddwd mm4, mm0             ;  [a0.M04+a1.M05 | b0.M20+b1.M21]
373
  pmaddwd mm4, mm0             ;  [a0.M04+a1.M05 | b0.M20+b1.M21]
374
  movq mm6, qword [%3 + 32]    ;  [   M08    M09      M24    M25]
374
  movq mm6, qword [32 + %3]    ;  [   M08    M09      M24    M25]
375
  pmaddwd mm5, mm1             ;  [a2.M06+a3.M07 | b2.M22+b3.M23]
375
  pmaddwd mm5, mm1             ;  [a2.M06+a3.M07 | b2.M22+b3.M23]
376
  movq mm7, qword [%3 + 40]    ;  [   M10    M11      M26    M27]
376
  movq mm7, qword [40 + %3]    ;  [   M10    M11      M26    M27]
377
  pmaddwd mm6, mm0             ;  [a0.M08+a1.M09 | b0.M24+b1.M25]
377
  pmaddwd mm6, mm0             ;  [a0.M08+a1.M09 | b0.M24+b1.M25]
378
  paddd mm2, mm3               ;  [ out0 | out1 ]
378
  paddd mm2, mm3               ;  [ out0 | out1 ]
379
  pmaddwd mm7, mm1             ;  [a0.M10+a1.M11 | b0.M26+b1.M27]
379
  pmaddwd mm7, mm1             ;  [a0.M10+a1.M11 | b0.M26+b1.M27]
380
  psrad mm2, 16
380
  psrad mm2, 16
381
  pmaddwd mm0, qword [%3 + 48] ;  [a0.M12+a1.M13 | b0.M28+b1.M29]
381
  pmaddwd mm0, qword [48 + %3] ;  [a0.M12+a1.M13 | b0.M28+b1.M29]
382
  paddd mm4, mm5               ;  [ out2 | out3 ]
382
  paddd mm4, mm5               ;  [ out2 | out3 ]
383
  pmaddwd mm1, qword [%3 + 56] ;  [a0.M14+a1.M15 | b0.M30+b1.M31]
383
  pmaddwd mm1, qword [56 + %3] ;  [a0.M14+a1.M15 | b0.M30+b1.M31]
384
  psrad mm4, 16
384
  psrad mm4, 16
385
385
386
  paddd mm6, mm7               ;  [ out4 | out5 ]
386
  paddd mm6, mm7               ;  [ out4 | out5 ]
Lines 422-444 MMX_One: Link Here
422
  punpckldq mm0, mm7           ; mm0 = [a0 a1 b0 b1]
422
  punpckldq mm0, mm7           ; mm0 = [a0 a1 b0 b1]
423
  punpckhdq mm1, mm7           ; mm1 = [b2 b3 a2 a3]
423
  punpckhdq mm1, mm7           ; mm1 = [b2 b3 a2 a3]
424
424
425
  movq mm2, qword [%3 + 0]     ;  [   M00    M01      M16    M17]
425
  movq mm2, qword [0 + %3]     ;  [   M00    M01      M16    M17]
426
  movq mm3, qword [%3 + 8]     ;  [   M02    M03      M18    M19]
426
  movq mm3, qword [8 + %3]     ;  [   M02    M03      M18    M19]
427
  pmaddwd mm2, mm0             ;  [a0.M00+a1.M01 | b0.M16+b1.M17]
427
  pmaddwd mm2, mm0             ;  [a0.M00+a1.M01 | b0.M16+b1.M17]
428
  movq mm4, qword [%3 + 16]    ;  [   M04    M05      M20    M21]
428
  movq mm4, qword [16 + %3]    ;  [   M04    M05      M20    M21]
429
  pmaddwd mm3, mm1             ;  [a2.M02+a3.M03 | b2.M18+b3.M19]
429
  pmaddwd mm3, mm1             ;  [a2.M02+a3.M03 | b2.M18+b3.M19]
430
  movq mm5, qword [%3 + 24]    ;  [   M06    M07      M22    M23]
430
  movq mm5, qword [24 + %3]    ;  [   M06    M07      M22    M23]
431
  pmaddwd mm4, mm0             ;  [a0.M04+a1.M05 | b0.M20+b1.M21]
431
  pmaddwd mm4, mm0             ;  [a0.M04+a1.M05 | b0.M20+b1.M21]
432
  movq mm6, qword [%3 + 32]    ;  [   M08    M09      M24    M25]
432
  movq mm6, qword [32 + %3]    ;  [   M08    M09      M24    M25]
433
  pmaddwd mm5, mm1             ;  [a2.M06+a3.M07 | b2.M22+b3.M23]
433
  pmaddwd mm5, mm1             ;  [a2.M06+a3.M07 | b2.M22+b3.M23]
434
  movq mm7, qword [%3 + 40]    ;  [   M10    M11      M26    M27]
434
  movq mm7, qword [40 + %3]    ;  [   M10    M11      M26    M27]
435
  pmaddwd mm6, mm0             ;  [a0.M08+a1.M09 | b0.M24+b1.M25]
435
  pmaddwd mm6, mm0             ;  [a0.M08+a1.M09 | b0.M24+b1.M25]
436
  paddd mm2, mm3               ;  [ out0 | out1 ]
436
  paddd mm2, mm3               ;  [ out0 | out1 ]
437
  pmaddwd mm7, mm1             ;  [a0.M10+a1.M11 | b0.M26+b1.M27]
437
  pmaddwd mm7, mm1             ;  [a0.M10+a1.M11 | b0.M26+b1.M27]
438
  psrad mm2, 16
438
  psrad mm2, 16
439
  pmaddwd mm0, qword [%3 + 48] ;  [a0.M12+a1.M13 | b0.M28+b1.M29]
439
  pmaddwd mm0, qword [48 + %3] ;  [a0.M12+a1.M13 | b0.M28+b1.M29]
440
  paddd mm4, mm5               ;  [ out2 | out3 ]
440
  paddd mm4, mm5               ;  [ out2 | out3 ]
441
  pmaddwd mm1, qword [%3 + 56] ;  [a0.M14+a1.M15 | b0.M30+b1.M31]
441
  pmaddwd mm1, qword [56 + %3] ;  [a0.M14+a1.M15 | b0.M30+b1.M31]
442
  psrad mm4, 16
442
  psrad mm4, 16
443
443
444
  paddd mm6, mm7               ;  [ out4 | out5 ]
444
  paddd mm6, mm7               ;  [ out4 | out5 ]
Lines 467-478 MMX_One: Link Here
467
ALIGN 16
467
ALIGN 16
468
cglobal %1
468
cglobal %1
469
%1:
469
%1:
470
  push ebx
471
  call get_pc.bx
472
  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
473
470
%ifdef UNROLLED_LOOP
474
%ifdef UNROLLED_LOOP
471
  mov ecx, [esp + 4]
475
  mov ecx, [esp + 4 + 4]
472
%else
476
%else
473
  push ebx
477
  push esi
474
  push edi
478
  push edi
475
  mov ecx, [esp + 8 + 4]
479
  mov ecx, [esp + 12 + 4]
476
%endif
480
%endif
477
481
478
  fLLM_PASS ecx+0, ecx+0, 3
482
  fLLM_PASS ecx+0, ecx+0, 3
Lines 481-507 cglobal %1 Link Here
481
%ifdef UNROLLED_LOOP
485
%ifdef UNROLLED_LOOP
482
%assign i 0
486
%assign i 0
483
%rep 8
487
%rep 8
484
  %2 ecx+i*16, ecx+i*16, fdct_table+i*64, fdct_rounding_1+i*8, fdct_rounding_2+i*8
488
  %2 ecx+i*16, ecx+i*16, ebx + fdct_table+i*64 wrt ..gotoff, ebx + fdct_rounding_1+i*8 wrt ..gotoff, ebx + fdct_rounding_2+i*8 wrt ..gotoff
485
	%assign i i+1
489
	%assign i i+1
486
%endrep
490
%endrep
487
%else
491
%else
488
  mov eax, 8
492
  mov eax, 8
489
  mov edx, fdct_table
493
  lea edx, [ebx + fdct_table wrt ..gotoff]
490
  mov ebx, fdct_rounding_1
494
  lea esi, [ebx + fdct_rounding_1 wrt ..gotoff]
491
  mov edi, fdct_rounding_2
495
  lea edi, [ebx + fdct_rounding_2 wrt ..gotoff]
492
.loop
496
.loop
493
  %2 ecx, ecx, edx, ebx, edi
497
  %2 ecx, ecx, edx, esi, edi
494
  add ecx, 2*8
498
  add ecx, 2*8
495
  add edx, 2*32
499
  add edx, 2*32
496
  add ebx, 2*4
500
  add esi, 2*4
497
  add edi, 2*4
501
  add edi, 2*4
498
  dec eax
502
  dec eax
499
  jne .loop
503
  jne .loop
500
504
501
  pop edi
505
  pop edi
502
  pop ebx
506
  pop esi
503
%endif
507
%endif
504
508
509
  pop ebx
505
  ret
510
  ret
506
.endfunc
511
.endfunc
507
%endmacro
512
%endmacro
Lines 512-517 cglobal %1 Link Here
512
517
513
SECTION .text
518
SECTION .text
514
519
520
extern  _GLOBAL_OFFSET_TABLE_
521
get_pc.bx:                      ; PIC helper: hands the caller its own return address in ebx
522
  mov ebx, [esp]                ; on entry [esp] is the return address pushed by `call get_pc.bx`
523
  retn                          ; near return; esp is left untouched, only ebx is modified
524
515
;-----------------------------------------------------------------------------
525
;-----------------------------------------------------------------------------
516
; void fdct_mmx_skal(int16_t block[64]);
526
; void fdct_mmx_skal(int16_t block[64]);
517
;-----------------------------------------------------------------------------
527
;-----------------------------------------------------------------------------
(-)xvidcore-1.1.3-old/src/dct/x86_asm/fdct_sse2_skal.asm (-40 / +54 lines)
Lines 238-247 cglobal fdct_sse2_skal Link Here
238
  pshufd  xmm6, xmm0, 01010101b ; [13131313]
238
  pshufd  xmm6, xmm0, 01010101b ; [13131313]
239
  pshufd  xmm7, xmm0, 11111111b ; [57575757]
239
  pshufd  xmm7, xmm0, 11111111b ; [57575757]
240
240
241
  pmaddwd xmm4, [%2+ 0]   ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
241
  pmaddwd xmm4, [ 0 + %2]   ; dot [M00,M01][M04,M05][M08,M09][M12,M13]
242
  pmaddwd xmm5, [%2+16]   ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
242
  pmaddwd xmm5, [16 + %2]   ; dot [M02,M03][M06,M07][M10,M11][M14,M15]
243
  pmaddwd xmm6, [%2+32]   ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
243
  pmaddwd xmm6, [32 + %2]   ; dot [M16,M17][M20,M21][M24,M25][M28,M29]
244
  pmaddwd xmm7, [%2+48]   ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
244
  pmaddwd xmm7, [48 + %2]   ; dot [M18,M19][M22,M23][M26,M27][M30,M31]
245
  paddd   xmm4, [%3]      ; Round
245
  paddd   xmm4, [%3]      ; Round
246
246
247
  paddd   xmm6, xmm7      ; [b0|b1|b2|b3]
247
  paddd   xmm6, xmm7      ; [b0|b1|b2|b3]
Lines 267-278 cglobal fdct_sse2_skal Link Here
267
267
268
%macro iLLM_PASS 1  ; %1: src/dst
268
%macro iLLM_PASS 1  ; %1: src/dst
269
269
270
  movdqa xmm0, [tan3]     ; t3-1
270
  movdqa xmm0, [ebx + tan3 wrt ..gotoff]     ; t3-1
271
  movdqa xmm3, [%1+16*3]  ; x3
271
  movdqa xmm3, [%1+16*3]  ; x3
272
  movdqa xmm1, xmm0       ; t3-1
272
  movdqa xmm1, xmm0       ; t3-1
273
  movdqa xmm5, [%1+16*5]  ; x5
273
  movdqa xmm5, [%1+16*5]  ; x5
274
274
275
  movdqa xmm4, [tan1]     ; t1
275
  movdqa xmm4, [ebx + tan1 wrt ..gotoff]     ; t1
276
  movdqa xmm6, [%1+16*1]  ; x1
276
  movdqa xmm6, [%1+16*1]  ; x1
277
  movdqa xmm7, [%1+16*7]  ; x7
277
  movdqa xmm7, [%1+16*7]  ; x7
278
  movdqa xmm2, xmm4       ; t1
278
  movdqa xmm2, xmm4       ; t1
Lines 290-296 cglobal fdct_sse2_skal Link Here
290
  psubsw xmm2, xmm7       ; x1*t1-x7 = tm17
290
  psubsw xmm2, xmm7       ; x1*t1-x7 = tm17
291
291
292
292
293
  movdqa xmm3, [sqrt2]
293
  movdqa xmm3, [ebx + sqrt2 wrt ..gotoff]
294
  movdqa xmm7, xmm4
294
  movdqa xmm7, xmm4
295
  movdqa xmm6, xmm2
295
  movdqa xmm6, xmm2
296
  psubsw xmm4, xmm1       ; tp17-tp35 = t1
296
  psubsw xmm4, xmm1       ; tp17-tp35 = t1
Lines 310-316 cglobal fdct_sse2_skal Link Here
310
  paddsw xmm0, xmm0       ; 2.(t1+t2) = b1
310
  paddsw xmm0, xmm0       ; 2.(t1+t2) = b1
311
  paddsw xmm4, xmm4       ; 2.(t1-t2) = b2
311
  paddsw xmm4, xmm4       ; 2.(t1-t2) = b2
312
312
313
  movdqa xmm7, [tan2]     ; t2
313
  movdqa xmm7, [ebx + tan2 wrt ..gotoff]     ; t2
314
  movdqa xmm3, [%1+2*16]  ; x2
314
  movdqa xmm3, [%1+2*16]  ; x2
315
  movdqa xmm6, [%1+6*16]  ; x6
315
  movdqa xmm6, [%1+6*16]  ; x6
316
  movdqa xmm5, xmm7       ; t2
316
  movdqa xmm5, xmm7       ; t2
Lines 402-456 cglobal fdct_sse2_skal Link Here
402
402
403
ALIGN 16
403
ALIGN 16
404
idct_sse2_skal:
404
idct_sse2_skal:
405
  push ebx
406
  call get_pc.bx
407
  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
405
408
406
  mov ecx, [esp+ 4]  ; Src
409
  mov ecx, [esp+ 4 +4]  ; Src
407
410
408
  TEST_ROW ecx, .Row0_Round
411
  TEST_ROW ecx, .Row0_Round
409
  iMTX_MULT  0, iTab1, Walken_Idct_Rounders + 16*0, 11
412
  iMTX_MULT  0, ebx + iTab1 wrt ..gotoff, ebx + 16*0 + Walken_Idct_Rounders wrt ..gotoff, 11
410
  jmp .Row1
413
  jmp .Row1
411
.Row0_Round
414
.Row0_Round
412
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0]
415
  movdqa xmm0, [ebx + 16*8 + 8*0 + Walken_Idct_Rounders wrt ..gotoff]
413
  movdqa [ecx  ], xmm0
416
  movdqa [ecx  ], xmm0
414
417
415
.Row1
418
.Row1
416
  TEST_ROW ecx+16, .Row1_Round
419
  TEST_ROW ecx+16, .Row1_Round
417
  iMTX_MULT  1, iTab2, Walken_Idct_Rounders + 16*1, 11
420
  iMTX_MULT  1, ebx + iTab2 wrt ..gotoff, ebx + 16*1 + Walken_Idct_Rounders wrt ..gotoff, 11
418
  jmp .Row2
421
  jmp .Row2
419
.Row1_Round
422
.Row1_Round
420
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
423
  movdqa xmm0, [ebx + 16*8 + 16*1 + Walken_Idct_Rounders wrt ..gotoff]
421
  movdqa [ecx+16  ], xmm0
424
  movdqa [ecx+16  ], xmm0
422
425
423
.Row2
426
.Row2
424
  TEST_ROW ecx+32, .Row2_Round
427
  TEST_ROW ecx+32, .Row2_Round
425
  iMTX_MULT  2, iTab3, Walken_Idct_Rounders + 16*2, 11
428
  iMTX_MULT  2, ebx + iTab3 wrt ..gotoff, ebx + 16*2 + Walken_Idct_Rounders wrt ..gotoff, 11
426
  jmp .Row3
429
  jmp .Row3
427
.Row2_Round
430
.Row2_Round
428
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
431
  movdqa xmm0, [ebx + 16*8 + 16*2 + Walken_Idct_Rounders wrt ..gotoff]
429
  movdqa [ecx+32  ], xmm0
432
  movdqa [ecx+32  ], xmm0
430
433
431
.Row3
434
.Row3
432
  TEST_ROW ecx+48, .Row4
435
  TEST_ROW ecx+48, .Row4
433
  iMTX_MULT  3, iTab4, Walken_Idct_Rounders + 16*3, 11
436
  iMTX_MULT  3, ebx + iTab4 wrt ..gotoff, ebx + 16*3 + Walken_Idct_Rounders wrt ..gotoff, 11
434
437
435
.Row4
438
.Row4
436
  TEST_ROW ecx+64, .Row5
439
  TEST_ROW ecx+64, .Row5
437
  iMTX_MULT  4, iTab1, Walken_Idct_Rounders + 16*4, 11
440
  iMTX_MULT  4, ebx + iTab1 wrt ..gotoff, ebx + 16*4 + Walken_Idct_Rounders wrt ..gotoff, 11
438
441
439
.Row5
442
.Row5
440
  TEST_ROW ecx+80, .Row6
443
  TEST_ROW ecx+80, .Row6
441
  iMTX_MULT  5, iTab4, Walken_Idct_Rounders + 16*5, 11
444
  iMTX_MULT  5, ebx + iTab4 wrt ..gotoff, ebx + 16*5 + Walken_Idct_Rounders wrt ..gotoff, 11
442
445
443
.Row6
446
.Row6
444
  TEST_ROW ecx+96, .Row7
447
  TEST_ROW ecx+96, .Row7
445
  iMTX_MULT  6, iTab3, Walken_Idct_Rounders + 16*6, 11
448
  iMTX_MULT  6, ebx + iTab3 wrt ..gotoff, ebx + 16*6 + Walken_Idct_Rounders wrt ..gotoff, 11
446
449
447
.Row7
450
.Row7
448
  TEST_ROW ecx+112, .End
451
  TEST_ROW ecx+112, .End
449
  iMTX_MULT  7, iTab2, Walken_Idct_Rounders + 16*7, 11
452
  iMTX_MULT  7, ebx + iTab2 wrt ..gotoff, ebx + 16*7 + Walken_Idct_Rounders wrt ..gotoff, 11
450
.End
453
.End
451
454
452
  iLLM_PASS ecx
455
  iLLM_PASS ecx
453
456
  pop ebx
454
  ret
457
  ret
455
.endfunc
458
.endfunc
456
459
Lines 507-521 idct_sse2_skal: Link Here
507
  paddsw xmm2, xmm1         ; xmm2: t6+t5
510
  paddsw xmm2, xmm1         ; xmm2: t6+t5
508
  movdqa [%1+0*16], xmm5   ; => out0
511
  movdqa [%1+0*16], xmm5   ; => out0
509
512
510
  movdqa xmm4, [tan2]      ; xmm4 <= tan2
513
  movdqa xmm4, [ebx + tan2 wrt ..gotoff]      ; xmm4 <= tan2
511
  pmulhw xmm4, xmm7         ; tm03*tan2
514
  pmulhw xmm4, xmm7         ; tm03*tan2
512
  movdqa xmm5, [tan2]      ; xmm5 <= tan2
515
  movdqa xmm5, [ebx + tan2 wrt ..gotoff]      ; xmm5 <= tan2
513
  psubsw xmm4, xmm6         ; out6 = tm03*tan2 - tm12
516
  psubsw xmm4, xmm6         ; out6 = tm03*tan2 - tm12
514
  pmulhw xmm5, xmm6         ; tm12*tan2
517
  pmulhw xmm5, xmm6         ; tm12*tan2
515
  paddsw xmm5, xmm7         ; out2 = tm12*tan2 + tm03
518
  paddsw xmm5, xmm7         ; out2 = tm12*tan2 + tm03
516
519
517
  movdqa xmm6, [sqrt2]  
520
  movdqa xmm6, [ebx + sqrt2 wrt ..gotoff]
518
  movdqa xmm7, [Rounder1]
521
  movdqa xmm7, [ebx + Rounder1 wrt ..gotoff]
519
522
520
  pmulhw xmm2, xmm6         ; xmm2: tp65 = (t6 + t5)*cos4
523
  pmulhw xmm2, xmm6         ; xmm2: tp65 = (t6 + t5)*cos4
521
  por    xmm5, xmm7         ; correct out2
524
  por    xmm5, xmm7         ; correct out2
Lines 533-540 idct_sse2_skal: Link Here
533
  paddsw xmm2, xmm4         ; xmm2: tp765 = t7 + tp65
536
  paddsw xmm2, xmm4         ; xmm2: tp765 = t7 + tp65
534
  paddsw xmm1, xmm5         ; xmm1: tp465 = t4 + tm65
537
  paddsw xmm1, xmm5         ; xmm1: tp465 = t4 + tm65
535
538
536
  movdqa xmm4, [tan3]      ; tan3 - 1
539
  movdqa xmm4, [ebx + tan3 wrt ..gotoff]      ; tan3 - 1
537
  movdqa xmm5, [tan1]      ; tan1
540
  movdqa xmm5, [ebx + tan1 wrt ..gotoff]      ; tan1
538
541
539
  movdqa xmm7, xmm3         ; save tm465
542
  movdqa xmm7, xmm3         ; save tm465
540
  pmulhw xmm3, xmm4         ; tm465*(tan3-1)
543
  pmulhw xmm3, xmm4         ; tm465*(tan3-1)
Lines 581-592 idct_sse2_skal: Link Here
581
    ;  [M08 M09    M24 M25] [M14 M15    M30 M31]  x mm0 = [4 /5 /6'/7']
584
    ;  [M08 M09    M24 M25] [M14 M15    M30 M31]  x mm0 = [4 /5 /6'/7']
582
    ;  [M10 M11    M26 M27] [M12 M13    M28 M29]  x mm2 = [4'/5'/6 /7 ]
585
    ;  [M10 M11    M26 M27] [M12 M13    M28 M29]  x mm2 = [4'/5'/6 /7 ]
583
586
584
  movdqa  xmm1, [%2+16]
587
  movdqa  xmm1, [16+%2]
585
  movdqa  xmm3, [%2+32]
588
  movdqa  xmm3, [32+%2]
586
  pmaddwd xmm1, xmm2
589
  pmaddwd xmm1, xmm2
587
  pmaddwd xmm3, xmm0
590
  pmaddwd xmm3, xmm0
588
  pmaddwd xmm2, [%2+48]
591
  pmaddwd xmm2, [48+%2]
589
  pmaddwd xmm0, [%2+ 0]
592
  pmaddwd xmm0, [ 0+%2]
590
593
591
  paddd   xmm0, xmm1             ;  [ out0 | out1 ][ out2 | out3 ]
594
  paddd   xmm0, xmm1             ;  [ out0 | out1 ][ out2 | out3 ]
592
  paddd   xmm2, xmm3             ;  [ out4 | out5 ][ out6 | out7 ]
595
  paddd   xmm2, xmm3             ;  [ out4 | out5 ][ out6 | out7 ]
Lines 601-622 idct_sse2_skal: Link Here
601
  movdqa  [ecx+%1*16+0], xmm0
604
  movdqa  [ecx+%1*16+0], xmm0
602
%endmacro
605
%endmacro
603
606
607
extern  _GLOBAL_OFFSET_TABLE_
608
get_pc.bx:                      ; PIC helper: hands the caller its own return address in ebx
609
  mov ebx, [esp]                ; on entry [esp] is the return address pushed by `call get_pc.bx`
610
  retn                          ; near return; esp is left untouched, only ebx is modified
611
604
;-----------------------------------------------------------------------------
612
;-----------------------------------------------------------------------------
605
; Function Forward DCT
613
; Function Forward DCT
606
;-----------------------------------------------------------------------------
614
;-----------------------------------------------------------------------------
607
615
608
ALIGN 16
616
ALIGN 16
609
fdct_sse2_skal:
617
fdct_sse2_skal:
610
  mov ecx, [esp+4]
618
  push ebx
619
  call get_pc.bx
620
  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
621
622
  mov ecx, [esp+4+4]
611
  fLLM_PASS ecx+0, 3
623
  fLLM_PASS ecx+0, 3
612
  fMTX_MULT  0, fTab1, Fdct_Rnd0
624
  fMTX_MULT  0, ebx + fTab1 wrt ..gotoff, ebx + Fdct_Rnd0 wrt ..gotoff
613
  fMTX_MULT  1, fTab2, Fdct_Rnd2
625
  fMTX_MULT  1, ebx + fTab2 wrt ..gotoff, ebx + Fdct_Rnd2 wrt ..gotoff
614
  fMTX_MULT  2, fTab3, Fdct_Rnd1
626
  fMTX_MULT  2, ebx + fTab3 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
615
  fMTX_MULT  3, fTab4, Fdct_Rnd1
627
  fMTX_MULT  3, ebx + fTab4 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
616
  fMTX_MULT  4, fTab1, Fdct_Rnd0
628
  fMTX_MULT  4, ebx + fTab1 wrt ..gotoff, ebx + Fdct_Rnd0 wrt ..gotoff
617
  fMTX_MULT  5, fTab4, Fdct_Rnd1
629
  fMTX_MULT  5, ebx + fTab4 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
618
  fMTX_MULT  6, fTab3, Fdct_Rnd1
630
  fMTX_MULT  6, ebx + fTab3 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
619
  fMTX_MULT  7, fTab2, Fdct_Rnd1
631
  fMTX_MULT  7, ebx + fTab2 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff
632
633
  pop ebx
620
  ret
634
  ret
621
.endfunc
635
.endfunc
622
636
(-)xvidcore-1.1.3-old/src/dct/x86_asm/idct_3dne.asm (-88 / +98 lines)
Lines 223-228 tab_i_35_xmm: Link Here
223
223
224
SECTION .text
224
SECTION .text
225
225
226
extern  _GLOBAL_OFFSET_TABLE_
227
get_pc.bx:                      ; PIC helper: hands the caller its own return address in ebx
228
  mov ebx, [esp]                ; on entry [esp] is the return address pushed by `call get_pc.bx`
229
  retn                          ; near return; esp is left untouched, only ebx is modified
230
226
cglobal idct_3dne
231
cglobal idct_3dne
227
232
228
;-----------------------------------------------------------------------------
233
;-----------------------------------------------------------------------------
Lines 231-255 cglobal idct_3dne Link Here
231
236
232
ALIGN 16
237
ALIGN 16
233
idct_3dne:
238
idct_3dne:
234
  mov eax, [esp+4]
239
  push ebx
240
  call get_pc.bx
241
  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
242
243
  mov eax, [esp+4+4]
235
244
236
;   DCT_8_INV_ROW_1_s [eax+64], [eax+64], tab_i_04_sse, rounder_4 ;rounder_4=0
245
;   DCT_8_INV_ROW_1_s [eax+64], [eax+64], tab_i_04_sse, rounder_4 ;rounder_4=0
237
  pshufw mm0, [eax+64],10001000b        ; x2 x0 x2 x0
246
  pshufw mm0, [eax+64],10001000b        ; x2 x0 x2 x0
238
  movq mm3, [tab_i_04_xmm]          ; 3     ; w05 w04 w01 w00
247
  movq mm3, [ebx + tab_i_04_xmm wrt ..gotoff]          ; 3     ; w05 w04 w01 w00
239
  pshufw mm1, [eax+64+8],10001000b  ; x6 x4 x6 x4
248
  pshufw mm1, [eax+64+8],10001000b  ; x6 x4 x6 x4
240
  movq mm4, [tab_i_04_xmm+8]        ; 4     ; w07 w06 w03 w02
249
  movq mm4, [ebx + tab_i_04_xmm+8 wrt ..gotoff]        ; 4     ; w07 w06 w03 w02
241
  pshufw mm2, [eax+64],11011101b        ; x3 x1 x3 x1
250
  pshufw mm2, [eax+64],11011101b        ; x3 x1 x3 x1
242
  pshufw mm5, [eax+64+8],11011101b  ; x7 x5 x7 x5
251
  pshufw mm5, [eax+64+8],11011101b  ; x7 x5 x7 x5
243
  movq mm6, [tab_i_04_xmm+32]   ; 6     ; w21 w20 w17 w16
252
  movq mm6, [ebx + tab_i_04_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
244
  pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
253
  pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
245
  movq mm7, [tab_i_04_xmm+40]   ; 7     ; w23 w22 w19 w18 ;
254
  movq mm7, [ebx + tab_i_04_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18 ;
246
  pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
255
  pmaddwd mm0, [ebx + tab_i_04_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
247
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
256
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
248
  pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
257
  pmaddwd mm1, [ebx + tab_i_04_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
249
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
258
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
250
  pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
259
  pmaddwd mm2, [ebx + tab_i_04_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
251
  pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
260
  pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
252
  pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
261
  pmaddwd mm5, [ebx + tab_i_04_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
253
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
262
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
254
  paddd mm0, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
263
  paddd mm0, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
255
  pshufw mm1, [eax+80+8],10001000b  ; x6 x4 x6 x4
264
  pshufw mm1, [eax+80+8],10001000b  ; x6 x4 x6 x4
Lines 260-271 idct_3dne: Link Here
260
  movq mm7, mm0                 ; 7     ; a3 a2
269
  movq mm7, mm0                 ; 7     ; a3 a2
261
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
270
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
262
  paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
271
  paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
263
  movq mm3, [tab_i_35_xmm]      ; 3     ; w05 w04 w01 w00
272
  movq mm3, [ebx + tab_i_35_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
264
  psubd mm7, mm2                ; ; a3-b3 a2-b2
273
  psubd mm7, mm2                ; ; a3-b3 a2-b2
265
  paddd mm0, mm2                ; 0 free a3+b3 a2+b2
274
  paddd mm0, mm2                ; 0 free a3+b3 a2+b2
266
  pshufw mm2, [eax+80],11011101b; x3 x1 x3 x1
275
  pshufw mm2, [eax+80],11011101b; x3 x1 x3 x1
267
  pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
276
  pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
268
  pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
277
  pmaddwd mm5, [ebx + tab_i_35_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
269
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
278
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
270
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
279
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
271
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
280
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
Lines 276-294 idct_3dne: Link Here
276
  movq [eax+64], mm6            ; 3     ; save y3 y2 y1 y0 stall2
285
  movq [eax+64], mm6            ; 3     ; save y3 y2 y1 y0 stall2
277
286
278
;   DCT_8_INV_ROW_1_s [eax+80], [eax+80], tab_i_35_xmm, rounder_5
287
;   DCT_8_INV_ROW_1_s [eax+80], [eax+80], tab_i_35_xmm, rounder_5
279
  movq mm4, [tab_i_35_xmm+8]    ; 4     ; w07 w06 w03 w02
288
  movq mm4, [ebx + tab_i_35_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
280
  movq mm6, [tab_i_35_xmm+32]   ; 6     ; w21 w20 w17 w16
289
  movq mm6, [ebx + tab_i_35_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
281
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
290
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
282
  paddd mm3, [rounder_5]        ; +rounder stall 6
291
  paddd mm3, [ebx + rounder_5 wrt ..gotoff]        ; +rounder stall 6
283
  paddd mm5, [rounder_5]        ; +rounder
292
  paddd mm5, [ebx + rounder_5 wrt ..gotoff]        ; +rounder
284
  movq [eax+64+8], mm7          ; 7     ; save y7 y6 y5 y4
293
  movq [eax+64+8], mm7          ; 7     ; save y7 y6 y5 y4
285
  movq mm7, [tab_i_35_xmm+40]   ; 7     ; w23 w22 w19 w18
294
  movq mm7, [ebx + tab_i_35_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
286
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
295
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
287
  pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
296
  pmaddwd mm1, [ebx + tab_i_35_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
288
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
297
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
289
  pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
298
  pmaddwd mm2, [ebx + tab_i_35_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
290
  pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
299
  pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
291
  pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
300
  pmaddwd mm0, [ebx + tab_i_35_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
292
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
301
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
293
  paddd mm5, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
302
  paddd mm5, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
294
  pshufw mm1, [eax+96+8],10001000b  ; x6 x4 x6 x4
303
  pshufw mm1, [eax+96+8],10001000b  ; x6 x4 x6 x4
Lines 299-310 idct_3dne: Link Here
299
  movq mm7, mm5                 ; 7     ; a3 a2
308
  movq mm7, mm5                 ; 7     ; a3 a2
300
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0 stall 5
309
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0 stall 5
301
  paddd mm6, mm3                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
310
  paddd mm6, mm3                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
302
  movq mm3, [tab_i_26_xmm]      ; 3     ; w05 w04 w01 w00
311
  movq mm3, [ebx + tab_i_26_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
303
  psubd mm7, mm2                ; ; a3-b3 a2-b2
312
  psubd mm7, mm2                ; ; a3-b3 a2-b2
304
  paddd mm5, mm2                ; 0 free a3+b3 a2+b2
313
  paddd mm5, mm2                ; 0 free a3+b3 a2+b2
305
  pshufw mm2, [eax+96],11011101b; x3 x1 x3 x1
314
  pshufw mm2, [eax+96],11011101b; x3 x1 x3 x1
306
  pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
315
  pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
307
  pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
316
  pmaddwd mm0, [ebx + tab_i_26_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
308
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
317
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
309
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
318
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
310
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
319
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
Lines 315-333 idct_3dne: Link Here
315
  movq [eax+80], mm6            ; 3     ; save y3 y2 y1 y0
324
  movq [eax+80], mm6            ; 3     ; save y3 y2 y1 y0
316
325
317
;   DCT_8_INV_ROW_1_s [eax+96], [eax+96], tab_i_26_xmm, rounder_6
326
;   DCT_8_INV_ROW_1_s [eax+96], [eax+96], tab_i_26_xmm, rounder_6
318
  movq mm4, [tab_i_26_xmm+8]    ; 4     ; w07 w06 w03 w02
327
  movq mm4, [ebx + tab_i_26_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
319
  movq mm6, [tab_i_26_xmm+32]   ; 6     ; w21 w20 w17 w16
328
  movq mm6, [ebx + tab_i_26_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
320
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4 STALL 6
329
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4 STALL 6
321
  paddd mm3, [rounder_6]        ; +rounder
330
  paddd mm3, [ebx + rounder_6 wrt ..gotoff]        ; +rounder
322
  paddd mm0, [rounder_6]        ; +rounder
331
  paddd mm0, [ebx + rounder_6 wrt ..gotoff]        ; +rounder
323
  movq [eax+80+8], mm7          ; 7     ; save y7 y6
332
  movq [eax+80+8], mm7          ; 7     ; save y7 y6
324
  movq mm7, [tab_i_26_xmm+40]   ; 7     ; w23 w22 w19 w18
333
  movq mm7, [ebx + tab_i_26_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
325
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
334
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
326
  pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
335
  pmaddwd mm1, [ebx + tab_i_26_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
327
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
336
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
328
  pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
337
  pmaddwd mm2, [ebx + tab_i_26_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
329
  pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
338
  pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
330
  pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
339
  pmaddwd mm5, [ebx + tab_i_26_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
331
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
340
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
332
  paddd mm0, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
341
  paddd mm0, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
333
  pshufw mm1, [eax+112+8],10001000b ; x6 x4 x6 x4
342
  pshufw mm1, [eax+112+8],10001000b ; x6 x4 x6 x4
Lines 338-349 idct_3dne: Link Here
338
  movq mm7, mm0                 ; 7     ; a3 a2
347
  movq mm7, mm0                 ; 7     ; a3 a2
339
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
348
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
340
  paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
349
  paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
341
  movq mm3, [tab_i_17_xmm]      ; 3     ; w05 w04 w01 w00
350
  movq mm3, [ebx + tab_i_17_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
342
  psubd mm7, mm2                ; ; a3-b3 a2-b2
351
  psubd mm7, mm2                ; ; a3-b3 a2-b2
343
  paddd mm0, mm2                ; 0 free a3+b3 a2+b2
352
  paddd mm0, mm2                ; 0 free a3+b3 a2+b2
344
  pshufw mm2, [eax+112],11011101b; x3 x1 x3 x1
353
  pshufw mm2, [eax+112],11011101b; x3 x1 x3 x1
345
  pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
354
  pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
346
  pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
355
  pmaddwd mm5, [ebx + tab_i_17_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
347
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
356
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
348
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
357
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
349
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
358
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
Lines 354-372 idct_3dne: Link Here
354
  movq [eax+96], mm6            ; 3     ; save y3 y2 y1 y0 stall2
363
  movq [eax+96], mm6            ; 3     ; save y3 y2 y1 y0 stall2
355
364
356
;   DCT_8_INV_ROW_1_s [eax+112], [eax+112], tab_i_17_xmm, rounder_7
365
;   DCT_8_INV_ROW_1_s [eax+112], [eax+112], tab_i_17_xmm, rounder_7
357
  movq mm4, [tab_i_17_xmm+8]    ; 4     ; w07 w06 w03 w02
366
  movq mm4, [ebx + tab_i_17_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
358
  movq mm6, [tab_i_17_xmm+32]   ; 6     ; w21 w20 w17 w16
367
  movq mm6, [ebx + tab_i_17_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
359
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
368
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
360
  paddd mm3, [rounder_7]        ; +rounder stall 6
369
  paddd mm3, [ebx + rounder_7 wrt ..gotoff]        ; +rounder stall 6
361
  paddd mm5, [rounder_7]        ; +rounder
370
  paddd mm5, [ebx + rounder_7 wrt ..gotoff]        ; +rounder
362
  movq [eax+96+8], mm7          ; 7     ; save y7 y6 y5 y4
371
  movq [eax+96+8], mm7          ; 7     ; save y7 y6 y5 y4
363
  movq mm7, [tab_i_17_xmm+40]   ; 7     ; w23 w22 w19 w18
372
  movq mm7, [ebx + tab_i_17_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
364
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
373
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
365
  pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
374
  pmaddwd mm1, [ebx + tab_i_17_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
366
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
375
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
367
  pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
376
  pmaddwd mm2, [ebx + tab_i_17_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
368
  pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
377
  pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
369
  pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
378
  pmaddwd mm0, [ebx + tab_i_17_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
370
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
379
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
371
  paddd mm5, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
380
  paddd mm5, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
372
  pshufw mm1, [eax+0+8],10001000b; x6 x4 x6 x4
381
  pshufw mm1, [eax+0+8],10001000b; x6 x4 x6 x4
Lines 377-388 idct_3dne: Link Here
377
  movq mm7, mm5                 ; 7     ; a3 a2
386
  movq mm7, mm5                 ; 7     ; a3 a2
378
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0 stall 5
387
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0 stall 5
379
  paddd mm6, mm3                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
388
  paddd mm6, mm3                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
380
  movq mm3, [tab_i_04_xmm]      ; 3     ; w05 w04 w01 w00
389
  movq mm3, [ebx + tab_i_04_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
381
  psubd mm7, mm2                ; ; a3-b3 a2-b2
390
  psubd mm7, mm2                ; ; a3-b3 a2-b2
382
  paddd mm5, mm2                ; 0 free a3+b3 a2+b2
391
  paddd mm5, mm2                ; 0 free a3+b3 a2+b2
383
  pshufw mm2, [eax+0],11011101b ; x3 x1 x3 x1
392
  pshufw mm2, [eax+0],11011101b ; x3 x1 x3 x1
384
  pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
393
  pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
385
  pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
394
  pmaddwd mm0, [ebx + tab_i_04_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
386
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
395
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
387
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
396
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
388
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
397
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
Lines 393-411 idct_3dne: Link Here
393
  movq [eax+112], mm6           ; 3     ; save y3 y2 y1 y0
402
  movq [eax+112], mm6           ; 3     ; save y3 y2 y1 y0
394
403
395
;   DCT_8_INV_ROW_1_s [eax+0],  0, tab_i_04_xmm, rounder_0
404
;   DCT_8_INV_ROW_1_s [eax+0],  0, tab_i_04_xmm, rounder_0
396
  movq mm4, [tab_i_04_xmm+8]    ; 4     ; w07 w06 w03 w02
405
  movq mm4, [ebx + tab_i_04_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
397
  movq mm6, [tab_i_04_xmm+32]   ; 6     ; w21 w20 w17 w16
406
  movq mm6, [ebx + tab_i_04_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
398
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4 STALL 6
407
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4 STALL 6
399
  paddd mm3, [rounder_0]        ; +rounder
408
  paddd mm3, [ebx + rounder_0 wrt ..gotoff]        ; +rounder
400
  paddd mm0, [rounder_0]        ; +rounder
409
  paddd mm0, [ebx + rounder_0 wrt ..gotoff]        ; +rounder
401
  movq [eax+112+8], mm7         ; 7     ; save y7 y6
410
  movq [eax+112+8], mm7         ; 7     ; save y7 y6
402
  movq mm7, [tab_i_04_xmm+40]   ; 7     ; w23 w22 w19 w18
411
  movq mm7, [ebx + tab_i_04_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
403
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
412
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
404
  pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
413
  pmaddwd mm1, [ebx + tab_i_04_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
405
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
414
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
406
  pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
415
  pmaddwd mm2, [ebx + tab_i_04_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
407
  pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
416
  pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
408
  pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
417
  pmaddwd mm5, [ebx + tab_i_04_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
409
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
418
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
410
  paddd mm0, mm1                ; 1
419
  paddd mm0, mm1                ; 1
411
  pshufw mm1, [eax+16+8],10001000b  ; x6 x4 x6 x4
420
  pshufw mm1, [eax+16+8],10001000b  ; x6 x4 x6 x4
Lines 416-427 idct_3dne: Link Here
416
  movq mm7, mm0                 ; 7     ; a3 a2
425
  movq mm7, mm0                 ; 7     ; a3 a2
417
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
426
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
418
  paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
427
  paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
419
  movq mm3, [tab_i_17_xmm]      ; 3     ; w05 w04 w01 w00
428
  movq mm3, [ebx + tab_i_17_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
420
  psubd mm7, mm2                ; ; a3-b3 a2-b2
429
  psubd mm7, mm2                ; ; a3-b3 a2-b2
421
  paddd mm0, mm2                ; 0 free a3+b3 a2+b2
430
  paddd mm0, mm2                ; 0 free a3+b3 a2+b2
422
  pshufw mm2, [eax+16],11011101b; x3 x1 x3 x1
431
  pshufw mm2, [eax+16],11011101b; x3 x1 x3 x1
423
  pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
432
  pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
424
  pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
433
  pmaddwd mm5, [ebx + tab_i_17_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
425
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
434
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
426
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
435
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
427
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
436
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
Lines 432-450 idct_3dne: Link Here
432
  movq [eax+0], mm6             ; 3     ; save y3 y2 y1 y0 stall2
441
  movq [eax+0], mm6             ; 3     ; save y3 y2 y1 y0 stall2
433
442
434
; DCT_8_INV_ROW_1_s [eax+16], 16, tab_i_17_xmm, rounder_1
443
; DCT_8_INV_ROW_1_s [eax+16], 16, tab_i_17_xmm, rounder_1
435
  movq mm4, [tab_i_17_xmm+8]    ; 4     ; w07 w06 w03 w02
444
  movq mm4, [ebx + tab_i_17_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
436
  movq mm6, [tab_i_17_xmm+32]   ; 6     ; w21 w20 w17 w16
445
  movq mm6, [ebx + tab_i_17_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
437
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
446
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
438
  paddd mm3, [rounder_1]        ; +rounder stall 6
447
  paddd mm3, [ebx + rounder_1 wrt ..gotoff]        ; +rounder stall 6
439
  paddd mm5, [rounder_1]        ; +rounder
448
  paddd mm5, [ebx + rounder_1 wrt ..gotoff]        ; +rounder
440
  movq [eax+0+8], mm7           ; 7     ; save y7 y6 y5 y4
449
  movq [eax+0+8], mm7           ; 7     ; save y7 y6 y5 y4
441
  movq mm7, [tab_i_17_xmm+40]   ; 7     ; w23 w22 w19 w18
450
  movq mm7, [ebx + tab_i_17_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
442
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
451
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
443
  pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
452
  pmaddwd mm1, [ebx + tab_i_17_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
444
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
453
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
445
  pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
454
  pmaddwd mm2, [ebx + tab_i_17_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
446
  pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
455
  pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
447
  pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
456
  pmaddwd mm0, [ebx + tab_i_17_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
448
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
457
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
449
  paddd mm5, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
458
  paddd mm5, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
450
  pshufw mm1, [eax+32+8],10001000b  ; x6 x4 x6 x4
459
  pshufw mm1, [eax+32+8],10001000b  ; x6 x4 x6 x4
Lines 455-466 idct_3dne: Link Here
455
  movq mm7, mm5                 ; 7     ; a3 a2
464
  movq mm7, mm5                 ; 7     ; a3 a2
456
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0 stall 5
465
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0 stall 5
457
  paddd mm6, mm3                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
466
  paddd mm6, mm3                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
458
  movq mm3, [tab_i_26_xmm]      ; 3     ; w05 w04 w01 w00
467
  movq mm3, [ebx + tab_i_26_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
459
  psubd mm7, mm2                ; ; a3-b3 a2-b2
468
  psubd mm7, mm2                ; ; a3-b3 a2-b2
460
  paddd mm5, mm2                ; 0 free a3+b3 a2+b2
469
  paddd mm5, mm2                ; 0 free a3+b3 a2+b2
461
  pshufw mm2, [eax+32],11011101b; x3 x1 x3 x1
470
  pshufw mm2, [eax+32],11011101b; x3 x1 x3 x1
462
  pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
471
  pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
463
  pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
472
  pmaddwd mm0, [ebx + tab_i_26_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
464
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
473
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
465
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
474
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
466
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
475
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
Lines 471-489 idct_3dne: Link Here
471
  movq [eax+16], mm6            ; 3     ; save y3 y2 y1 y0
480
  movq [eax+16], mm6            ; 3     ; save y3 y2 y1 y0
472
481
473
;   DCT_8_INV_ROW_1_s [eax+32], 32, tab_i_26_xmm, rounder_2
482
;   DCT_8_INV_ROW_1_s [eax+32], 32, tab_i_26_xmm, rounder_2
474
  movq mm4, [tab_i_26_xmm+8]    ; 4     ; w07 w06 w03 w02
483
  movq mm4, [ebx + tab_i_26_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
475
  movq mm6, [tab_i_26_xmm+32]   ; 6     ; w21 w20 w17 w16
484
  movq mm6, [ebx + tab_i_26_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
476
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4 STALL 6
485
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4 STALL 6
477
  paddd mm3, [rounder_2]        ; +rounder
486
  paddd mm3, [ebx + rounder_2 wrt ..gotoff]        ; +rounder
478
  paddd mm0, [rounder_2]        ; +rounder
487
  paddd mm0, [ebx + rounder_2 wrt ..gotoff]        ; +rounder
479
  movq [eax+16+8], mm7          ; 7     ; save y7 y6
488
  movq [eax+16+8], mm7          ; 7     ; save y7 y6
480
  movq mm7, [tab_i_26_xmm+40]   ; 7     ; w23 w22 w19 w18
489
  movq mm7, [ebx + tab_i_26_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
481
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
490
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
482
  pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
491
  pmaddwd mm1, [ebx + tab_i_26_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
483
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
492
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
484
  pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
493
  pmaddwd mm2, [ebx + tab_i_26_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
485
  pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
494
  pmaddwd mm7, mm5              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
486
  pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
495
  pmaddwd mm5, [ebx + tab_i_26_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
487
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
496
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
488
  paddd mm0, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
497
  paddd mm0, mm1                ; 1 free    ; a3=sum(even3) a2=sum(even2)
489
  pshufw mm1, [eax+48+8],10001000b      ; x6 x4 x6 x4
498
  pshufw mm1, [eax+48+8],10001000b      ; x6 x4 x6 x4
Lines 494-505 idct_3dne: Link Here
494
  movq mm7, mm0                 ; 7     ; a3 a2
503
  movq mm7, mm0                 ; 7     ; a3 a2
495
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
504
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
496
  paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
505
  paddd mm6, mm3                ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
497
  movq mm3, [tab_i_35_xmm]      ; 3     ; w05 w04 w01 w00
506
  movq mm3, [ebx + tab_i_35_xmm wrt ..gotoff]      ; 3     ; w05 w04 w01 w00
498
  psubd mm7, mm2                ; ; a3-b3 a2-b2
507
  psubd mm7, mm2                ; ; a3-b3 a2-b2
499
  paddd mm0, mm2                ; 0 free a3+b3 a2+b2
508
  paddd mm0, mm2                ; 0 free a3+b3 a2+b2
500
  pshufw mm2, [eax+48],11011101b; x3 x1 x3 x1
509
  pshufw mm2, [eax+48],11011101b; x3 x1 x3 x1
501
  pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
510
  pmaddwd mm3, mm5              ; x2*w05+x0*w04 x2*w01+x0*w00
502
  pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08
511
  pmaddwd mm5, [ebx + tab_i_35_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08
503
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
512
  psrad mm4, SHIFT_INV_ROW      ; y6=a1-b1 y7=a0-b0
504
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
513
  psrad mm7, SHIFT_INV_ROW      ; y4=a3-b3 y5=a2-b2
505
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
514
  psrad mm6, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
Lines 510-535 idct_3dne: Link Here
510
  movq [eax+32], mm6            ; 3     ; save y3 y2 y1 y0 stall2
519
  movq [eax+32], mm6            ; 3     ; save y3 y2 y1 y0 stall2
511
520
512
;   DCT_8_INV_ROW_1_s [eax+48], [eax+48], tab_i_35_xmm, rounder_3
521
;   DCT_8_INV_ROW_1_s [eax+48], [eax+48], tab_i_35_xmm, rounder_3
513
  movq mm4, [tab_i_35_xmm+8]    ; 4     ; w07 w06 w03 w02
522
  movq mm4, [ebx + tab_i_35_xmm+8 wrt ..gotoff]    ; 4     ; w07 w06 w03 w02
514
  movq mm6, [tab_i_35_xmm+32]   ; 6     ; w21 w20 w17 w16
523
  movq mm6, [ebx + tab_i_35_xmm+32 wrt ..gotoff]   ; 6     ; w21 w20 w17 w16
515
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
524
  pshufw mm7, mm7, 10110001b    ; y7 y6 y5 y4
516
  paddd mm3, [rounder_3]        ; +rounder stall 6
525
  paddd mm3, [ebx + rounder_3 wrt ..gotoff]        ; +rounder stall 6
517
  paddd mm5, [rounder_3]        ; +rounder
526
  paddd mm5, [ebx + rounder_3 wrt ..gotoff]        ; +rounder
518
  movq [eax+32+8], mm7          ; 7     ; save y7 y6 y5 y4
527
  movq [eax+32+8], mm7          ; 7     ; save y7 y6 y5 y4
519
  movq mm7, [tab_i_35_xmm+40]   ; 7     ; w23 w22 w19 w18
528
  movq mm7, [ebx + tab_i_35_xmm+40 wrt ..gotoff]   ; 7     ; w23 w22 w19 w18
520
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
529
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
521
  pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10
530
  pmaddwd mm1, [ebx + tab_i_35_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10
522
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
531
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
523
  pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24
532
  pmaddwd mm2, [ebx + tab_i_35_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24
524
  pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
533
  pmaddwd mm7, mm0              ; 7     ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18
525
  pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26
534
  pmaddwd mm0, [ebx + tab_i_35_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26
526
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
535
  paddd mm3, mm4                ; 4 free    ; a1=sum(even1) a0=sum(even0)
527
  paddd mm5, mm1                ; mm1 free  ; a3=sum(even3) a2=sum(even2)
536
  paddd mm5, mm1                ; mm1 free  ; a3=sum(even3) a2=sum(even2)
528
  movq mm1, [tg_3_16]
537
  movq mm1, [ebx + tg_3_16 wrt ..gotoff]
529
  movq mm4, mm3                 ; 4     ; a1 a0
538
  movq mm4, mm3                 ; 4     ; a1 a0
530
  paddd mm6, mm7                ; 7 free    ; b1=sum(odd1) b0=sum(odd0)
539
  paddd mm6, mm7                ; 7 free    ; b1=sum(odd1) b0=sum(odd0)
531
  paddd mm2, mm0                ; 5 free    ; b3=sum(odd3) b2=sum(odd2)
540
  paddd mm2, mm0                ; 5 free    ; b3=sum(odd3) b2=sum(odd2)
532
  movq mm0, [tg_3_16]
541
  movq mm0, [ebx + tg_3_16 wrt ..gotoff]
533
  movq mm7, mm5                 ; 7     ; a3 a2
542
  movq mm7, mm5                 ; 7     ; a3 a2
534
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
543
  psubd mm4, mm6                ; 6 free    ; a1-b1 a0-b0
535
  paddd mm3, mm6                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
544
  paddd mm3, mm6                ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0
Lines 542-548 idct_3dne: Link Here
542
  psrad mm2, SHIFT_INV_ROW      ; y3=a3+b3 y2=a2+b2
551
  psrad mm2, SHIFT_INV_ROW      ; y3=a3+b3 y2=a2+b2
543
  movq mm6, [eax+16*1]
552
  movq mm6, [eax+16*1]
544
  packssdw mm7, mm4             ; 4     ; y6 y7 y4 y5
553
  packssdw mm7, mm4             ; 4     ; y6 y7 y4 y5
545
  movq mm4, [tg_1_16]
554
  movq mm4, [ebx + tg_1_16 wrt ..gotoff]
546
  packssdw mm3, mm2             ; 0 free    ; y3 y2 y1 y0
555
  packssdw mm3, mm2             ; 0 free    ; y3 y2 y1 y0
547
  pshufw mm2, mm7, 10110001b    ; y7 y6 y5 y4
556
  pshufw mm2, mm7, 10110001b    ; y7 y6 y5 y4
548
557
Lines 559-565 idct_3dne: Link Here
559
  paddsw mm1, mm3           ; x3+x5*(tg_3_16-1)
568
  paddsw mm1, mm3           ; x3+x5*(tg_3_16-1)
560
  psubsw mm0, mm5           ; x3*tg_3_16-x5 = tm35
569
  psubsw mm0, mm5           ; x3*tg_3_16-x5 = tm35
561
  movq [eax+48], mm3        ; 3     ; save y3 y2 y1 y0
570
  movq [eax+48], mm3        ; 3     ; save y3 y2 y1 y0
562
  movq mm3, [ocos_4_16]
571
  movq mm3, [ebx + ocos_4_16 wrt ..gotoff]
563
  paddsw mm1, mm5           ; x3+x5*tg_3_16 = tp35
572
  paddsw mm1, mm5           ; x3+x5*tg_3_16 = tp35
564
  paddsw mm4, mm6           ; x1+tg_1_16*x7 = tp17
573
  paddsw mm4, mm6           ; x1+tg_1_16*x7 = tp17
565
  psubsw mm2, mm7           ; x1*tg_1_16-x7 = tm17
574
  psubsw mm2, mm7           ; x1*tg_1_16-x7 = tm17
Lines 569-575 idct_3dne: Link Here
569
  psubsw mm6, mm0           ; tm17-tm35 = b3
578
  psubsw mm6, mm0           ; tm17-tm35 = b3
570
  psubsw mm4, mm1           ; tp17-tp35 = t1
579
  psubsw mm4, mm1           ; tp17-tp35 = t1
571
  paddsw mm2, mm0           ; tm17+tm35 = t2
580
  paddsw mm2, mm0           ; tm17+tm35 = t2
572
  movq mm7, [tg_2_16]
581
  movq mm7, [ebx + tg_2_16 wrt ..gotoff]
573
  movq mm1, mm4             ; t1
582
  movq mm1, mm4             ; t1
574
  movq [eax+3*16], mm5      ; save b0
583
  movq [eax+3*16], mm5      ; save b0
575
  paddsw mm1, mm2           ; t1+t2
584
  paddsw mm1, mm2           ; t1+t2
Lines 620-626 idct_3dne: Link Here
620
  movq mm6, mm2             ; a3
629
  movq mm6, mm2             ; a3
621
  psraw mm4, SHIFT_INV_COL  ; dst7
630
  psraw mm4, SHIFT_INV_COL  ; dst7
622
  movq [eax+5*16], mm0
631
  movq [eax+5*16], mm0
623
  movq mm0, [tg_3_16]
632
  movq mm0, [ebx + tg_3_16 wrt ..gotoff]
624
  paddsw mm2, mm3           ; a3+b3
633
  paddsw mm2, mm3           ; a3+b3
625
  movq [eax+6*16], mm7
634
  movq [eax+6*16], mm7
626
  psubsw mm6, mm3           ; a3-b3
635
  psubsw mm6, mm3           ; a3-b3
Lines 634-640 idct_3dne: Link Here
634
  movq mm5, [eax+8+16*5]
643
  movq mm5, [eax+8+16*5]
635
  psraw mm6, SHIFT_INV_COL  ; dst4
644
  psraw mm6, SHIFT_INV_COL  ; dst4
636
  pmulhw mm0, mm3           ; x3*(tg_3_16-1)
645
  pmulhw mm0, mm3           ; x3*(tg_3_16-1)
637
  movq mm4, [tg_1_16]
646
  movq mm4, [ebx + tg_1_16 wrt ..gotoff]
638
  pmulhw mm1, mm5           ; x5*(tg_3_16-1)
647
  pmulhw mm1, mm5           ; x5*(tg_3_16-1)
639
  movq mm7, [eax+8+16*7]
648
  movq mm7, [eax+8+16*7]
640
  movq [eax+3*16], mm2
649
  movq [eax+3*16], mm2
Lines 646-652 idct_3dne: Link Here
646
  pmulhw mm2, mm6           ; x1*tg_1_16
655
  pmulhw mm2, mm6           ; x1*tg_1_16
647
  paddsw mm1, mm3           ; x3+x5*(tg_3_16-1)
656
  paddsw mm1, mm3           ; x3+x5*(tg_3_16-1)
648
  psubsw mm0, mm5           ; x3*tg_3_16-x5 = tm35
657
  psubsw mm0, mm5           ; x3*tg_3_16-x5 = tm35
649
  movq mm3, [ocos_4_16]
658
  movq mm3, [ebx + ocos_4_16 wrt ..gotoff]
650
  paddsw mm1, mm5           ; x3+x5*tg_3_16 = tp35
659
  paddsw mm1, mm5           ; x3+x5*tg_3_16 = tp35
651
  paddsw mm4, mm6           ; x1+tg_1_16*x7 = tp17
660
  paddsw mm4, mm6           ; x1+tg_1_16*x7 = tp17
652
  psubsw mm2, mm7           ; x1*tg_1_16-x7 = tm17
661
  psubsw mm2, mm7           ; x1*tg_1_16-x7 = tm17
Lines 655-661 idct_3dne: Link Here
655
  paddsw mm5, mm1           ; tp17+tp35 = b0
664
  paddsw mm5, mm1           ; tp17+tp35 = b0
656
  psubsw mm4, mm1           ; tp17-tp35 = t1
665
  psubsw mm4, mm1           ; tp17-tp35 = t1
657
  paddsw mm2, mm0           ; tm17+tm35 = t2
666
  paddsw mm2, mm0           ; tm17+tm35 = t2
658
  movq mm7, [tg_2_16]
667
  movq mm7, [ebx + tg_2_16 wrt ..gotoff]
659
  movq mm1, mm4             ; t1
668
  movq mm1, mm4             ; t1
660
  psubsw mm6, mm0           ; tm17-tm35 = b3
669
  psubsw mm6, mm0           ; tm17-tm35 = b3
661
  movq [eax+8+3*16], mm5    ; save b0
670
  movq [eax+8+3*16], mm5    ; save b0
Lines 717-722 idct_3dne: Link Here
717
  movq [eax+8+3*16], mm2
726
  movq [eax+8+3*16], mm2
718
  movq [eax+8+4*16], mm6
727
  movq [eax+8+4*16], mm6
719
728
729
  pop ebx
720
  ret
730
  ret
721
.endfunc
731
.endfunc
722
732
(-)xvidcore-1.1.3-old/src/dct/x86_asm/idct_mmx.asm (-36 / +51 lines)
Lines 326-350 tab_i_35_xmm: Link Here
326
  punpcklwd mm0, mm1        ; x5 x1 x4 x0
326
  punpcklwd mm0, mm1        ; x5 x1 x4 x0
327
  movq mm5, mm0             ; 5 ; x5 x1 x4 x0
327
  movq mm5, mm0             ; 5 ; x5 x1 x4 x0
328
  punpckldq mm0, mm0        ; x4 x0 x4 x0
328
  punpckldq mm0, mm0        ; x4 x0 x4 x0
329
  movq mm4, [%3+8]          ; 4 ; w07 w05 w03 w01
329
  movq mm4, [8+%3]          ; 4 ; w07 w05 w03 w01
330
  punpckhwd mm2, mm1        ; 1 ; x7 x3 x6 x2
330
  punpckhwd mm2, mm1        ; 1 ; x7 x3 x6 x2
331
  pmaddwd mm3, mm0          ; x4*w06+x0*w04 x4*w02+x0*w00
331
  pmaddwd mm3, mm0          ; x4*w06+x0*w04 x4*w02+x0*w00
332
  movq mm6, mm2             ; 6 ; x7 x3 x6 x2
332
  movq mm6, mm2             ; 6 ; x7 x3 x6 x2
333
  movq mm1, [%3+32]         ; 1 ; w22 w20 w18 w16
333
  movq mm1, [32+%3]         ; 1 ; w22 w20 w18 w16
334
  punpckldq mm2, mm2        ; x6 x2 x6 x2
334
  punpckldq mm2, mm2        ; x6 x2 x6 x2
335
  pmaddwd mm4, mm2          ; x6*w07+x2*w05 x6*w03+x2*w01
335
  pmaddwd mm4, mm2          ; x6*w07+x2*w05 x6*w03+x2*w01
336
  punpckhdq mm5, mm5        ; x5 x1 x5 x1
336
  punpckhdq mm5, mm5        ; x5 x1 x5 x1
337
  pmaddwd mm0, [%3+16]      ; x4*w14+x0*w12 x4*w10+x0*w08
337
  pmaddwd mm0, [16+%3]      ; x4*w14+x0*w12 x4*w10+x0*w08
338
  punpckhdq mm6, mm6        ; x7 x3 x7 x3
338
  punpckhdq mm6, mm6        ; x7 x3 x7 x3
339
  movq mm7, [%3+40]         ; 7 ; w23 w21 w19 w17
339
  movq mm7, [40+%3]         ; 7 ; w23 w21 w19 w17
340
  pmaddwd mm1, mm5          ; x5*w22+x1*w20 x5*w18+x1*w16
340
  pmaddwd mm1, mm5          ; x5*w22+x1*w20 x5*w18+x1*w16
341
  paddd mm3, [%4]           ; +%4
341
  paddd mm3, [%4]           ; +%4
342
  pmaddwd mm7, mm6          ; x7*w23+x3*w21 x7*w19+x3*w17
342
  pmaddwd mm7, mm6          ; x7*w23+x3*w21 x7*w19+x3*w17
343
  pmaddwd mm2, [%3+24]      ; x6*w15+x2*w13 x6*w11+x2*w09
343
  pmaddwd mm2, [24+%3]      ; x6*w15+x2*w13 x6*w11+x2*w09
344
  paddd mm3, mm4            ; 4 ; a1=sum(even1) a0=sum(even0)
344
  paddd mm3, mm4            ; 4 ; a1=sum(even1) a0=sum(even0)
345
  pmaddwd mm5, [%3+48]      ; x5*w30+x1*w28 x5*w26+x1*w24
345
  pmaddwd mm5, [48+%3]      ; x5*w30+x1*w28 x5*w26+x1*w24
346
  movq mm4, mm3             ; 4 ; a1 a0
346
  movq mm4, mm3             ; 4 ; a1 a0
347
  pmaddwd mm6, [%3+56]      ; x7*w31+x3*w29 x7*w27+x3*w25
347
  pmaddwd mm6, [56+%3]      ; x7*w31+x3*w29 x7*w27+x3*w25
348
  paddd mm1, mm7            ; 7 ; b1=sum(odd1) b0=sum(odd0)
348
  paddd mm1, mm7            ; 7 ; b1=sum(odd1) b0=sum(odd0)
349
  paddd mm0, [%4]           ; +%4
349
  paddd mm0, [%4]           ; +%4
350
  psubd mm3, mm1            ; a1-b1 a0-b0
350
  psubd mm3, mm1            ; a1-b1 a0-b0
Lines 378-402 tab_i_35_xmm: Link Here
378
  movq mm2, mm0                 ; 2     ; x3 x2 x1 x0
378
  movq mm2, mm0                 ; 2     ; x3 x2 x1 x0
379
  movq mm3, [%3]                ; 3     ; w05 w04 w01 w00
379
  movq mm3, [%3]                ; 3     ; w05 w04 w01 w00
380
  pshufw mm0, mm0, 10001000b    ; x2 x0 x2 x0
380
  pshufw mm0, mm0, 10001000b    ; x2 x0 x2 x0
381
  movq mm4, [%3+8]              ; 4     ; w07 w06 w03 w02
381
  movq mm4, [8+%3]              ; 4     ; w07 w06 w03 w02
382
  movq mm5, mm1                 ; 5     ; x7 x6 x5 x4
382
  movq mm5, mm1                 ; 5     ; x7 x6 x5 x4
383
  pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
383
  pmaddwd mm3, mm0              ; x2*w05+x0*w04 x2*w01+x0*w00
384
  movq mm6, [%3+32]             ; 6     ; w21 w20 w17 w16
384
  movq mm6, [32+%3]             ; 6     ; w21 w20 w17 w16
385
  pshufw mm1, mm1, 10001000b    ; x6 x4 x6 x4
385
  pshufw mm1, mm1, 10001000b    ; x6 x4 x6 x4
386
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
386
  pmaddwd mm4, mm1              ; x6*w07+x4*w06 x6*w03+x4*w02
387
  movq mm7, [%3+40]             ; 7    ; w23 w22 w19 w18
387
  movq mm7, [40+%3]             ; 7    ; w23 w22 w19 w18
388
  pshufw mm2, mm2, 11011101b    ; x3 x1 x3 x1
388
  pshufw mm2, mm2, 11011101b    ; x3 x1 x3 x1
389
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
389
  pmaddwd mm6, mm2              ; x3*w21+x1*w20 x3*w17+x1*w16
390
  pshufw mm5, mm5, 11011101b    ; x7 x5 x7 x5
390
  pshufw mm5, mm5, 11011101b    ; x7 x5 x7 x5
391
  pmaddwd mm7, mm5              ; x7*w23+x5*w22 x7*w19+x5*w18
391
  pmaddwd mm7, mm5              ; x7*w23+x5*w22 x7*w19+x5*w18
392
  paddd mm3, [%4]               ; +%4
392
  paddd mm3, [%4]               ; +%4
393
  pmaddwd mm0, [%3+16]          ; x2*w13+x0*w12 x2*w09+x0*w08
393
  pmaddwd mm0, [16+%3]          ; x2*w13+x0*w12 x2*w09+x0*w08
394
  paddd mm3, mm4                ; 4     ; a1=sum(even1) a0=sum(even0)
394
  paddd mm3, mm4                ; 4     ; a1=sum(even1) a0=sum(even0)
395
  pmaddwd mm1, [%3+24]          ; x6*w15+x4*w14 x6*w11+x4*w10
395
  pmaddwd mm1, [24+%3]          ; x6*w15+x4*w14 x6*w11+x4*w10
396
  movq mm4, mm3                 ; 4     ; a1 a0
396
  movq mm4, mm3                 ; 4     ; a1 a0
397
  pmaddwd mm2, [%3+48]          ; x3*w29+x1*w28 x3*w25+x1*w24
397
  pmaddwd mm2, [48+%3]          ; x3*w29+x1*w28 x3*w25+x1*w24
398
  paddd mm6, mm7                ; 7     ; b1=sum(odd1) b0=sum(odd0)
398
  paddd mm6, mm7                ; 7     ; b1=sum(odd1) b0=sum(odd0)
399
  pmaddwd mm5, [%3+56]          ; x7*w31+x5*w30 x7*w27+x5*w26
399
  pmaddwd mm5, [56+%3]          ; x7*w31+x5*w30 x7*w27+x5*w26
400
  paddd mm3, mm6                ; a1+b1 a0+b0
400
  paddd mm3, mm6                ; a1+b1 a0+b0
401
  paddd mm0, [%4]               ; +%4
401
  paddd mm0, [%4]               ; +%4
402
  psrad mm3, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
402
  psrad mm3, SHIFT_INV_ROW      ; y1=a1+b1 y0=a0+b0
Lines 480-491 tab_i_35_xmm: Link Here
480
;-----------------------------------------------------------------------------
480
;-----------------------------------------------------------------------------
481
481
482
%macro DCT_8_INV_COL 2
482
%macro DCT_8_INV_COL 2
483
  movq mm0, [tg_3_16]
483
  movq mm0, [ebx + tg_3_16 wrt ..gotoff]
484
  movq mm3, [%1+16*3]
484
  movq mm3, [%1+16*3]
485
  movq mm1, mm0             ; tg_3_16
485
  movq mm1, mm0             ; tg_3_16
486
  movq mm5, [%1+16*5]
486
  movq mm5, [%1+16*5]
487
  pmulhw mm0, mm3           ; x3*(tg_3_16-1)
487
  pmulhw mm0, mm3           ; x3*(tg_3_16-1)
488
  movq mm4, [tg_1_16]
488
  movq mm4, [ebx + tg_1_16 wrt ..gotoff]
489
  pmulhw mm1, mm5           ; x5*(tg_3_16-1)
489
  pmulhw mm1, mm5           ; x5*(tg_3_16-1)
490
  movq mm7, [%1+16*7]
490
  movq mm7, [%1+16*7]
491
  movq mm2, mm4             ; tg_1_16
491
  movq mm2, mm4             ; tg_1_16
Lines 495-501 tab_i_35_xmm: Link Here
495
  pmulhw mm2, mm6           ; x1*tg_1_16
495
  pmulhw mm2, mm6           ; x1*tg_1_16
496
  paddsw mm1, mm3           ; x3+x5*(tg_3_16-1)
496
  paddsw mm1, mm3           ; x3+x5*(tg_3_16-1)
497
  psubsw mm0, mm5           ; x3*tg_3_16-x5 = tm35
497
  psubsw mm0, mm5           ; x3*tg_3_16-x5 = tm35
498
  movq mm3, [ocos_4_16]
498
  movq mm3, [ebx + ocos_4_16 wrt ..gotoff]
499
  paddsw mm1, mm5           ; x3+x5*tg_3_16 = tp35
499
  paddsw mm1, mm5           ; x3+x5*tg_3_16 = tp35
500
  paddsw mm4, mm6           ; x1+tg_1_16*x7 = tp17
500
  paddsw mm4, mm6           ; x1+tg_1_16*x7 = tp17
501
  psubsw mm2, mm7           ; x1*tg_1_16-x7 = tm17
501
  psubsw mm2, mm7           ; x1*tg_1_16-x7 = tm17
Lines 505-511 tab_i_35_xmm: Link Here
505
  psubsw mm6, mm0           ; tm17-tm35 = b3
505
  psubsw mm6, mm0           ; tm17-tm35 = b3
506
  psubsw mm4, mm1           ; tp17-tp35 = t1
506
  psubsw mm4, mm1           ; tp17-tp35 = t1
507
  paddsw mm2, mm0           ; tm17+tm35 = t2
507
  paddsw mm2, mm0           ; tm17+tm35 = t2
508
  movq mm7, [tg_2_16]
508
  movq mm7, [ebx + tg_2_16 wrt ..gotoff]
509
  movq mm1, mm4             ; t1
509
  movq mm1, mm4             ; t1
510
;  movq [SCRATCH+0], mm5    ; save b0
510
;  movq [SCRATCH+0], mm5    ; save b0
511
  movq [%2+3*16], mm5       ; save b0
511
  movq [%2+3*16], mm5       ; save b0
Lines 577-582 tab_i_35_xmm: Link Here
577
577
578
SECTION .text
578
SECTION .text
579
579
580
extern  _GLOBAL_OFFSET_TABLE_
581
get_pc.bx:
582
    mov ebx, [esp]
583
    retn
584
580
cglobal idct_mmx
585
cglobal idct_mmx
581
cglobal idct_xmm
586
cglobal idct_xmm
582
587
Lines 586-607 cglobal idct_xmm Link Here
586
591
587
ALIGN 16
592
ALIGN 16
588
idct_mmx:
593
idct_mmx:
589
    mov eax, dword [esp + 4]
594
    push ebx
595
    call get_pc.bx
596
    add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
597
598
    mov eax, dword [esp + 4 + 4]
590
599
591
	;; Process each row
600
	;; Process each row
592
    DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, tab_i_04_mmx, rounder_0
601
    DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, ebx + tab_i_04_mmx wrt ..gotoff, ebx + rounder_0 wrt ..gotoff
593
    DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, tab_i_17_mmx, rounder_1
602
    DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, ebx + tab_i_17_mmx wrt ..gotoff, ebx + rounder_1 wrt ..gotoff
594
    DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, tab_i_26_mmx, rounder_2
603
    DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, ebx + tab_i_26_mmx wrt ..gotoff, ebx + rounder_2 wrt ..gotoff
595
    DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, tab_i_35_mmx, rounder_3
604
    DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, ebx + tab_i_35_mmx wrt ..gotoff, ebx + rounder_3 wrt ..gotoff
596
    DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, tab_i_04_mmx, rounder_4
605
    DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, ebx + tab_i_04_mmx wrt ..gotoff, ebx + rounder_4 wrt ..gotoff
597
    DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, tab_i_35_mmx, rounder_5
606
    DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, ebx + tab_i_35_mmx wrt ..gotoff, ebx + rounder_5 wrt ..gotoff
598
    DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, tab_i_26_mmx, rounder_6
607
    DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, ebx + tab_i_26_mmx wrt ..gotoff, ebx + rounder_6 wrt ..gotoff
599
    DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, tab_i_17_mmx, rounder_7
608
    DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, ebx + tab_i_17_mmx wrt ..gotoff, ebx + rounder_7 wrt ..gotoff
600
609
601
	;; Process the columns (4 at a time)
610
	;; Process the columns (4 at a time)
602
    DCT_8_INV_COL eax+0, eax+0
611
    DCT_8_INV_COL eax+0, eax+0
603
    DCT_8_INV_COL eax+8, eax+8
612
    DCT_8_INV_COL eax+8, eax+8
604
613
614
    pop ebx
605
    ret
615
    ret
606
.endfunc
616
.endfunc
607
617
Lines 611-632 idct_mmx: Link Here
611
621
612
ALIGN 16
622
ALIGN 16
613
idct_xmm:
623
idct_xmm:
614
    mov eax, dword [esp + 4]
624
    push ebx
625
    call get_pc.bx
626
    add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
627
628
    mov eax, dword [esp + 4 + 4]
615
629
616
	;; Process each row
630
	;; Process each row
617
    DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, tab_i_04_xmm, rounder_0
631
    DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, ebx + tab_i_04_xmm wrt ..gotoff, ebx + rounder_0 wrt ..gotoff
618
    DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, tab_i_17_xmm, rounder_1
632
    DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, ebx + tab_i_17_xmm wrt ..gotoff, ebx + rounder_1 wrt ..gotoff
619
    DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, tab_i_26_xmm, rounder_2
633
    DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, ebx + tab_i_26_xmm wrt ..gotoff, ebx + rounder_2 wrt ..gotoff
620
    DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, tab_i_35_xmm, rounder_3
634
    DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, ebx + tab_i_35_xmm wrt ..gotoff, ebx + rounder_3 wrt ..gotoff
621
    DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, tab_i_04_xmm, rounder_4
635
    DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, ebx + tab_i_04_xmm wrt ..gotoff, ebx + rounder_4 wrt ..gotoff
622
    DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, tab_i_35_xmm, rounder_5
636
    DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, ebx + tab_i_35_xmm wrt ..gotoff, ebx + rounder_5 wrt ..gotoff
623
    DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, tab_i_26_xmm, rounder_6
637
    DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, ebx + tab_i_26_xmm wrt ..gotoff, ebx + rounder_6 wrt ..gotoff
624
    DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, tab_i_17_xmm, rounder_7
638
    DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, ebx + tab_i_17_xmm wrt ..gotoff, ebx + rounder_7 wrt ..gotoff
625
639
626
	;; Process the columns (4 at a time)
640
	;; Process the columns (4 at a time)
627
    DCT_8_INV_COL eax+0, eax+0
641
    DCT_8_INV_COL eax+0, eax+0
628
    DCT_8_INV_COL eax+8, eax+8
642
    DCT_8_INV_COL eax+8, eax+8
629
643
644
    pop ebx
630
    ret
645
    ret
631
.endfunc
646
.endfunc
632
647
(-)xvidcore-1.1.3-old/src/dct/x86_asm/idct_sse2_dmitry.asm (-18 / +27 lines)
Lines 183-189 cglobal idct_sse2_dmitry Link Here
183
183
184
  ;a 3210 first part
184
  ;a 3210 first part
185
  pshufd xmm2, xmm1, 10101010b      ;x 64646464
185
  pshufd xmm2, xmm1, 10101010b      ;x 64646464
186
  pmaddwd xmm2, [%3+16]             ;w 15 14 11 10 7632
186
  pmaddwd xmm2, [16+%3]             ;w 15 14 11 10 7632
187
187
188
  ;a 3210 second part
188
  ;a 3210 second part
189
  paddd xmm2, xmm0                  ;a 3210 ready
189
  paddd xmm2, xmm0                  ;a 3210 ready
Lines 191-201 cglobal idct_sse2_dmitry Link Here
191
  movdqa xmm5, xmm2
191
  movdqa xmm5, xmm2
192
192
193
  pshufd xmm3, xmm1, 01010101b      ;x 31313131
193
  pshufd xmm3, xmm1, 01010101b      ;x 31313131
194
  pmaddwd xmm3, [%3+32]             ;w 29 28 25 24 21 20 17 16
194
  pmaddwd xmm3, [32+%3]             ;w 29 28 25 24 21 20 17 16
195
195
196
  ;b 3210 first part
196
  ;b 3210 first part
197
  pshufd xmm4, xmm1, 11111111b      ;x 75757575
197
  pshufd xmm4, xmm1, 11111111b      ;x 75757575
198
  pmaddwd xmm4, [%3+48]             ;w 31 30 27 26 23 22 19 18
198
  pmaddwd xmm4, [48+%3]             ;w 31 30 27 26 23 22 19 18
199
199
200
  ;b 3210 second part
200
  ;b 3210 second part
201
  paddd xmm3,xmm4                   ;b 3210 ready
201
  paddd xmm3,xmm4                   ;b 3210 ready
Lines 220-226 cglobal idct_sse2_dmitry Link Here
220
220
221
  movdqa xmm4, [%1+16*2]          	;x2
221
  movdqa xmm4, [%1+16*2]          	;x2
222
  movdqa xmm5, [%1+16*6]          	;x6
222
  movdqa xmm5, [%1+16*6]          	;x6
223
  movdqa xmm6, [tg_2_16]
223
  movdqa xmm6, [ebx + tg_2_16 wrt ..gotoff]
224
  movdqa xmm7, xmm6
224
  movdqa xmm7, xmm6
225
225
226
  paddsw xmm0, xmm2                  ;u04=x0+x4
226
  paddsw xmm0, xmm2                  ;u04=x0+x4
Lines 245-256 cglobal idct_sse2_dmitry Link Here
245
245
246
  movdqa xmm0, [%1+16*1]          	;x1
246
  movdqa xmm0, [%1+16*1]          	;x1
247
  movdqa xmm1, [%1+16*7]          	;x7
247
  movdqa xmm1, [%1+16*7]          	;x7
248
  movdqa xmm2, [tg_1_16]
248
  movdqa xmm2, [ebx + tg_1_16 wrt ..gotoff]
249
  movdqa xmm3, xmm2
249
  movdqa xmm3, xmm2
250
250
251
  movdqa xmm4, [%1+16*3]          	;x3
251
  movdqa xmm4, [%1+16*3]          	;x3
252
  movdqa xmm5, [%1+16*5]          	;x5
252
  movdqa xmm5, [%1+16*5]          	;x5
253
  movdqa xmm6, [tg_3_16]
253
  movdqa xmm6, [ebx + tg_3_16 wrt ..gotoff]
254
  movdqa xmm7, xmm6
254
  movdqa xmm7, xmm6
255
255
256
  pmulhw xmm2, xmm0
256
  pmulhw xmm2, xmm0
Lines 267-273 cglobal idct_sse2_dmitry Link Here
267
  psubsw xmm6, xmm5                  ;v35=x3*T3-x5
267
  psubsw xmm6, xmm5                  ;v35=x3*T3-x5
268
  paddsw xmm7, xmm4                  ;u35=x5*T3+x3
268
  paddsw xmm7, xmm4                  ;u35=x5*T3+x3
269
269
270
  movdqa xmm4, [ocos_4_16]
270
  movdqa xmm4, [ebx + ocos_4_16 wrt ..gotoff]
271
271
272
  paddsw xmm0, xmm7                 ;b0=u17+u35
272
  paddsw xmm0, xmm7                 ;b0=u17+u35
273
  psubsw xmm1, xmm6                 ;b3=v17-v35
273
  psubsw xmm1, xmm6                 ;b3=v17-v35
Lines 322-347 cglobal idct_sse2_dmitry Link Here
322
  movdqa [%2+16*5], xmm7
322
  movdqa [%2+16*5], xmm7
323
%endmacro
323
%endmacro
324
324
325
extern  _GLOBAL_OFFSET_TABLE_
326
get_pc.bx:
327
  mov ebx, [esp]
328
  retn
329
325
;-----------------------------------------------------------------------------
330
;-----------------------------------------------------------------------------
326
; void idct_sse2_dmitry(int16_t coeff[64]);
331
; void idct_sse2_dmitry(int16_t coeff[64]);
327
;-----------------------------------------------------------------------------
332
;-----------------------------------------------------------------------------
328
333
329
ALIGN 16
334
ALIGN 16
330
idct_sse2_dmitry:
335
idct_sse2_dmitry:
331
336
  push ebx
332
  mov eax, [esp + 4]
337
  call get_pc.bx
333
338
  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
334
  DCT_8_INV_ROW_1_SSE2 eax+  0, eax+  0, tab_i_04, rounder_2_0
339
335
  DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, tab_i_17, rounder_2_1
340
  mov eax, [esp + 4 + 4]
336
  DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, tab_i_26, rounder_2_2
341
337
  DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, tab_i_35, rounder_2_3
342
  DCT_8_INV_ROW_1_SSE2 eax+  0, eax+  0, ebx + tab_i_04 wrt ..gotoff, ebx + rounder_2_0 wrt ..gotoff
338
  DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, tab_i_04, rounder_2_4
343
  DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, ebx + tab_i_17 wrt ..gotoff, ebx + rounder_2_1 wrt ..gotoff
339
  DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, tab_i_35, rounder_2_5
344
  DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, ebx + tab_i_26 wrt ..gotoff, ebx + rounder_2_2 wrt ..gotoff
340
  DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, tab_i_26, rounder_2_6
345
  DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, ebx + tab_i_35 wrt ..gotoff, ebx + rounder_2_3 wrt ..gotoff
341
  DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, tab_i_17, rounder_2_7
346
  DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, ebx + tab_i_04 wrt ..gotoff, ebx + rounder_2_4 wrt ..gotoff
347
  DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, ebx + tab_i_35 wrt ..gotoff, ebx + rounder_2_5 wrt ..gotoff
348
  DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, ebx + tab_i_26 wrt ..gotoff, ebx + rounder_2_6 wrt ..gotoff
349
  DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, ebx + tab_i_17 wrt ..gotoff, ebx + rounder_2_7 wrt ..gotoff
342
350
343
  DCT_8_INV_COL_4_SSE2 eax, eax
351
  DCT_8_INV_COL_4_SSE2 eax, eax
344
352
353
  pop ebx
345
  ret
354
  ret
346
.endfunc
355
.endfunc
347
356
(-)xvidcore-1.1.3-old/src/dct/x86_asm/simple_idct_mmx.asm (-107 / +122 lines)
Lines 122-128 coeffs: Link Here
122
  movq mm1,[src4]               ; R6    R2  r6  r2
122
  movq mm1,[src4]               ; R6    R2  r6  r2
123
  movq mm2,[src1]               ; R3    R1  r3  r1
123
  movq mm2,[src1]               ; R3    R1  r3  r1
124
  movq mm3,[src5]               ; R7    R5  r7  r5
124
  movq mm3,[src5]               ; R7    R5  r7  r5
125
  movq mm4,[wm1010]
125
  movq mm4,[ebx + wm1010 wrt ..gotoff]
126
  pand mm4,mm0
126
  pand mm4,mm0
127
  por mm4,mm1
127
  por mm4,mm1
128
  por mm4,mm2
128
  por mm4,mm2
Lines 131-159 coeffs: Link Here
131
  movd eax,mm4
131
  movd eax,mm4
132
  or eax,eax
132
  or eax,eax
133
  jz near .skip1
133
  jz near .skip1
134
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
134
  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
135
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
135
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
136
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
136
  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
137
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
137
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
138
  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
138
  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
139
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
139
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
140
  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
140
  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
141
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
141
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
142
  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
142
  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
143
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
143
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
144
  rounder_op mm4, rounder_arg
144
  rounder_op mm4, rounder_arg
145
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
145
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
146
  paddd mm4,mm5                 ; A0        a0
146
  paddd mm4,mm5                 ; A0        a0
147
  psubd mm6,mm5                 ; A3        a3
147
  psubd mm6,mm5                 ; A3        a3
148
  movq mm5,[coeffs+56]          ; C7    C5  C7  C5
148
  movq mm5,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
149
  pmaddwd mm5,mm3               ; C7R7+C5R5 C7r7+C5r5
149
  pmaddwd mm5,mm3               ; C7R7+C5R5 C7r7+C5r5
150
  rounder_op mm0, rounder_arg
150
  rounder_op mm0, rounder_arg
151
  paddd mm1,mm0                 ; A1        a1
151
  paddd mm1,mm0                 ; A1        a1
152
  paddd mm0,mm0
152
  paddd mm0,mm0
153
  psubd mm0,mm1                 ; A2        a2
153
  psubd mm0,mm1                 ; A2        a2
154
  pmaddwd mm2,[coeffs+64]       ; -C7R3+C3R1    -C7r3+C3r1
154
  pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff]       ; -C7R3+C3R1    -C7r3+C3r1
155
  paddd mm7,mm5                 ; B0        b0
155
  paddd mm7,mm5                 ; B0        b0
156
  movq mm5,[coeffs+72]          ; -C5   -C1 -C5 -C1
156
  movq mm5,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
157
  pmaddwd mm5,mm3               ; -C5R7-C1R5    -C5r7-C1r5
157
  pmaddwd mm5,mm3               ; -C5R7-C1R5    -C5r7-C1r5
158
  paddd mm7,mm4                 ; A0+B0     a0+b0
158
  paddd mm7,mm4                 ; A0+B0     a0+b0
159
  paddd mm4,mm4                 ; 2A0       2a0
159
  paddd mm4,mm4                 ; 2A0       2a0
Lines 170-183 coeffs: Link Here
170
  packssdw mm2,mm4              ; A0-B0 a0-b0   A1-B1   a1-b1
170
  packssdw mm2,mm4              ; A0-B0 a0-b0   A1-B1   a1-b1
171
  movq [dst],mm7
171
  movq [dst],mm7
172
  movq mm1,[src1]               ; R3    R1  r3  r1
172
  movq mm1,[src1]               ; R3    R1  r3  r1
173
  movq mm4,[coeffs+80]          ;-C1    C5  -C1     C5
173
  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ;-C1    C5  -C1     C5
174
  movq [dst + 24],mm2
174
  movq [dst + 24],mm2
175
  pmaddwd mm4,mm1               ; -C1R3+C5R1    -C1r3+C5r1
175
  pmaddwd mm4,mm1               ; -C1R3+C5R1    -C1r3+C5r1
176
  movq mm7,[coeffs+88]          ; C3    C7  C3  C7
176
  movq mm7,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
177
  pmaddwd mm1,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
177
  pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
178
  pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
178
  pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
179
  movq mm2,mm0                  ; A2        a2
179
  movq mm2,mm0                  ; A2        a2
180
  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
180
  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
181
  paddd mm4,mm7                 ; B2        b2
181
  paddd mm4,mm7                 ; B2        b2
182
  paddd mm2,mm4                 ; A2+B2     a2+b2
182
  paddd mm2,mm4                 ; A2+B2     a2+b2
183
  psubd mm0,mm4                 ; a2-B2     a2-b2
183
  psubd mm0,mm4                 ; a2-B2     a2-b2
Lines 196-202 coeffs: Link Here
196
  jmp short .skip2
196
  jmp short .skip2
197
.skip1
197
.skip1
198
  pslld mm0,16
198
  pslld mm0,16
199
  paddd mm0,[d40000]
199
  paddd mm0,[ebx + d40000 wrt ..gotoff]
200
  psrad mm0,13
200
  psrad mm0,13
201
  packssdw mm0,mm0
201
  packssdw mm0,mm0
202
  movq [ dst ],mm0
202
  movq [ dst ],mm0
Lines 240-268 coeffs: Link Here
240
  movd eax,mm4
240
  movd eax,mm4
241
  or eax,eax
241
  or eax,eax
242
  jz near bt
242
  jz near bt
243
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
243
  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
244
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
244
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
245
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
245
  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
246
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
246
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
247
  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
247
  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
248
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
248
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
249
  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
249
  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
250
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
250
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
251
  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
251
  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
252
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
252
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
253
  rounder_op mm4, rounder_arg
253
  rounder_op mm4, rounder_arg
254
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
254
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
255
  paddd mm4,mm5                 ; A0        a0
255
  paddd mm4,mm5                 ; A0        a0
256
  psubd mm6,mm5                 ; A3        a3
256
  psubd mm6,mm5                 ; A3        a3
257
  movq mm5,[coeffs+56]          ; C7    C5  C7  C5
257
  movq mm5,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
258
  pmaddwd mm5,mm3               ; C7R7+C5R5 C7r7+C5r5
258
  pmaddwd mm5,mm3               ; C7R7+C5R5 C7r7+C5r5
259
  rounder_op mm0, rounder_arg
259
  rounder_op mm0, rounder_arg
260
  paddd mm1,mm0                 ; A1        a1
260
  paddd mm1,mm0                 ; A1        a1
261
  paddd mm0,mm0
261
  paddd mm0,mm0
262
  psubd mm0,mm1                 ; A2        a2
262
  psubd mm0,mm1                 ; A2        a2
263
  pmaddwd mm2,[coeffs+64]       ; -C7R3+C3R1    -C7r3+C3r1
263
  pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff]       ; -C7R3+C3R1    -C7r3+C3r1
264
  paddd mm7,mm5                 ; B0        b0
264
  paddd mm7,mm5                 ; B0        b0
265
  movq mm5,[coeffs+72]          ; -C5   -C1 -C5 -C1
265
  movq mm5,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
266
  pmaddwd mm5,mm3               ; -C5R7-C1R5    -C5r7-C1r5
266
  pmaddwd mm5,mm3               ; -C5R7-C1R5    -C5r7-C1r5
267
  paddd mm7,mm4                 ; A0+B0     a0+b0
267
  paddd mm7,mm4                 ; A0+B0     a0+b0
268
  paddd mm4,mm4                 ; 2A0       2a0
268
  paddd mm4,mm4                 ; 2A0       2a0
Lines 279-292 coeffs: Link Here
279
  packssdw mm2,mm4              ; A0-B0 a0-b0   A1-B1   a1-b1
279
  packssdw mm2,mm4              ; A0-B0 a0-b0   A1-B1   a1-b1
280
  movq [ dst ],mm7
280
  movq [ dst ],mm7
281
  movq mm1,[src1]               ; R3    R1  r3  r1
281
  movq mm1,[src1]               ; R3    R1  r3  r1
282
  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
282
  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ; -C1   C5  -C1     C5
283
  movq [ dst + 24 ],mm2
283
  movq [ dst + 24 ],mm2
284
  pmaddwd mm4,mm1               ; -C1R3+C5R1    -C1r3+C5r1
284
  pmaddwd mm4,mm1               ; -C1R3+C5R1    -C1r3+C5r1
285
  movq mm7,[coeffs+88]          ; C3    C7  C3  C7
285
  movq mm7,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
286
  pmaddwd mm1,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
286
  pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
287
  pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
287
  pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
288
  movq mm2,mm0                  ; A2        a2
288
  movq mm2,mm0                  ; A2        a2
289
  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
289
  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
290
  paddd mm4,mm7                 ; B2        b2
290
  paddd mm4,mm7                 ; B2        b2
291
  paddd mm2,mm4                 ; A2+B2     a2+b2
291
  paddd mm2,mm4                 ; A2+B2     a2+b2
292
  psubd mm0,mm4                 ; a2-B2     a2-b2
292
  psubd mm0,mm4                 ; a2-B2     a2-b2
Lines 330-346 coeffs: Link Here
330
  movq mm1,[src4]               ; R6    R2  r6  r2
330
  movq mm1,[src4]               ; R6    R2  r6  r2
331
  movq mm2,[src1]               ; R3    R1  r3  r1
331
  movq mm2,[src1]               ; R3    R1  r3  r1
332
  movq mm3,[src5]               ; R7    R5  r7  r5
332
  movq mm3,[src5]               ; R7    R5  r7  r5
333
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
333
  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
334
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
334
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
335
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
335
  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
336
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
336
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
337
  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
337
  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
338
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
338
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
339
  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
339
  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
340
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
340
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
341
  ; rounder_op mm4, rounder_arg
341
  ; rounder_op mm4, rounder_arg
342
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
342
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
343
  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
343
  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
344
  ; rounder_op mm0, rounder_arg
344
  ; rounder_op mm0, rounder_arg
345
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
345
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
346
  paddd mm4,mm5                 ; A0        a0
346
  paddd mm4,mm5                 ; A0        a0
Lines 348-358 coeffs: Link Here
348
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
348
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
349
  paddd mm0,mm1                 ; A1        a1
349
  paddd mm0,mm1                 ; A1        a1
350
  psubd mm5,mm1                 ; A2        a2
350
  psubd mm5,mm1                 ; A2        a2
351
  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
351
  movq mm1,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
352
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
352
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
353
  pmaddwd mm2,[coeffs+64]       ; -C7R3+C3R1    -C7r3+C3r1
353
  pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff]       ; -C7R3+C3R1    -C7r3+C3r1
354
  paddd mm7,mm1                 ; B0        b0
354
  paddd mm7,mm1                 ; B0        b0
355
  movq mm1,[coeffs+72]          ; -C5   -C1 -C5 -C1
355
  movq mm1,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
356
  pmaddwd mm1,mm3               ; -C5R7-C1R5    -C5r7-C1r5
356
  pmaddwd mm1,mm3               ; -C5R7-C1R5    -C5r7-C1r5
357
  paddd mm7,mm4                 ; A0+B0     a0+b0
357
  paddd mm7,mm4                 ; A0+B0     a0+b0
358
  paddd mm4,mm4                 ; 2A0       2a0
358
  paddd mm4,mm4                 ; 2A0       2a0
Lines 374-386 coeffs: Link Here
374
  packssdw mm4,mm4              ; A0-B0 a0-b0
374
  packssdw mm4,mm4              ; A0-B0 a0-b0
375
  movd [ dst + 112],mm4
375
  movd [ dst + 112],mm4
376
  movq mm0,[src1]               ; R3    R1  r3  r1
376
  movq mm0,[src1]               ; R3    R1  r3  r1
377
  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
377
  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ; -C1   C5  -C1     C5
378
  pmaddwd mm4,mm0               ; -C1R3+C5R1    -C1r3+C5r1
378
  pmaddwd mm4,mm0               ; -C1R3+C5R1    -C1r3+C5r1
379
  movq mm7,[coeffs+88]          ; C3    C7  C3  C7
379
  movq mm7,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
380
  pmaddwd mm0,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
380
  pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
381
  pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
381
  pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
382
  movq mm2,mm5                  ; A2        a2
382
  movq mm2,mm5                  ; A2        a2
383
  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
383
  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
384
  paddd mm4,mm7                 ; B2        b2
384
  paddd mm4,mm7                 ; B2        b2
385
  paddd mm2,mm4                 ; A2+B2     a2+b2
385
  paddd mm2,mm4                 ; A2+B2     a2+b2
386
  psubd mm5,mm4                 ; a2-B2     a2-b2
386
  psubd mm5,mm4                 ; a2-B2     a2-b2
Lines 426-438 coeffs: Link Here
426
  movq mm0,[src0]               ; R4    R0  r4  r0
426
  movq mm0,[src0]               ; R4    R0  r4  r0
427
  movq mm1,[src4]               ; R6    R2  r6  r2
427
  movq mm1,[src4]               ; R6    R2  r6  r2
428
  movq mm3,[src5]               ; R7    R5  r7  r5
428
  movq mm3,[src5]               ; R7    R5  r7  r5
429
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
429
  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
430
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
430
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
431
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
431
  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
432
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
432
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
433
  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
433
  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
434
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
434
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
435
  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
435
  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
436
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
436
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
437
  ; rounder_op mm4, rounder_arg
437
  ; rounder_op mm4, rounder_arg
438
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
438
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
Lines 442-450 coeffs: Link Here
442
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
442
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
443
  paddd mm0,mm1                 ; A1        a1
443
  paddd mm0,mm1                 ; A1        a1
444
  psubd mm5,mm1                 ; A2        a2
444
  psubd mm5,mm1                 ; A2        a2
445
  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
445
  movq mm1,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
446
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
446
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
447
  movq mm7,[coeffs+72]          ; -C5   -C1 -C5 -C1
447
  movq mm7,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
448
  pmaddwd mm7,mm3               ; -C5R7-C1R5    -C5r7-C1r5
448
  pmaddwd mm7,mm3               ; -C5R7-C1R5    -C5r7-C1r5
449
  paddd mm1,mm4                 ; A0+B0     a0+b0
449
  paddd mm1,mm4                 ; A0+B0     a0+b0
450
  paddd mm4,mm4                 ; 2A0       2a0
450
  paddd mm4,mm4                 ; 2A0       2a0
Lines 464-473 coeffs: Link Here
464
  movd [ dst + 96 ],mm2
464
  movd [ dst + 96 ],mm2
465
  packssdw mm4,mm4              ; A0-B0 a0-b0
465
  packssdw mm4,mm4              ; A0-B0 a0-b0
466
  movd [ dst + 112 ],mm4
466
  movd [ dst + 112 ],mm4
467
  movq mm1,[coeffs+88]          ; C3    C7  C3  C7
467
  movq mm1,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
468
  pmaddwd mm1,mm3               ; C3R7+C7R5 C3r7+C7r5
468
  pmaddwd mm1,mm3               ; C3R7+C7R5 C3r7+C7r5
469
  movq mm2,mm5                  ; A2        a2
469
  movq mm2,mm5                  ; A2        a2
470
  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
470
  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
471
  paddd mm2,mm1                 ; A2+B2     a2+b2
471
  paddd mm2,mm1                 ; A2+B2     a2+b2
472
  psubd mm5,mm1                 ; a2-B2     a2-b2
472
  psubd mm5,mm1                 ; a2-B2     a2-b2
473
  psrad mm2,shift
473
  psrad mm2,shift
Lines 510-526 coeffs: Link Here
510
%define	shift		%8
510
%define	shift		%8
511
  movq mm0,[src0]               ; R4    R0  r4  r0
511
  movq mm0,[src0]               ; R4    R0  r4  r0
512
  movq mm3,[src5]               ; R7    R5  r7  r5
512
  movq mm3,[src5]               ; R7    R5  r7  r5
513
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
513
  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
514
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
514
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
515
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
515
  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
516
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
516
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
517
  ; rounder_op mm4, rounder_arg
517
  ; rounder_op mm4, rounder_arg
518
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
518
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
519
  ; rounder_op mm0, rounder_arg
519
  ; rounder_op mm0, rounder_arg
520
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
520
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
521
  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
521
  movq mm1,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
522
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
522
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
523
  movq mm7,[coeffs+72]          ; -C5   -C1 -C5 -C1
523
  movq mm7,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
524
  pmaddwd mm7,mm3               ; -C5R7-C1R5    -C5r7-C1r5
524
  pmaddwd mm7,mm3               ; -C5R7-C1R5    -C5r7-C1r5
525
  paddd mm1,mm4                 ; A0+B0     a0+b0
525
  paddd mm1,mm4                 ; A0+B0     a0+b0
526
  paddd mm4,mm4                 ; 2A0       2a0
526
  paddd mm4,mm4                 ; 2A0       2a0
Lines 540-549 coeffs: Link Here
540
  movd [ dst + 96 ],mm2
540
  movd [ dst + 96 ],mm2
541
  packssdw mm4,mm4              ; A0-B0 a0-b0
541
  packssdw mm4,mm4              ; A0-B0 a0-b0
542
  movd [ dst + 112 ],mm4
542
  movd [ dst + 112 ],mm4
543
  movq mm1,[coeffs+88]          ; C3    C7  C3  C7
543
  movq mm1,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
544
  pmaddwd mm1,mm3               ; C3R7+C7R5 C3r7+C7r5
544
  pmaddwd mm1,mm3               ; C3R7+C7R5 C3r7+C7r5
545
  movq mm2,mm5                  ; A2        a2
545
  movq mm2,mm5                  ; A2        a2
546
  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
546
  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
547
  paddd mm2,mm1                 ; A2+B2     a2+b2
547
  paddd mm2,mm1                 ; A2+B2     a2+b2
548
  psubd mm5,mm1                 ; a2-B2     a2-b2
548
  psubd mm5,mm1                 ; a2-B2     a2-b2
549
  psrad mm2,shift
549
  psrad mm2,shift
Lines 587-607 coeffs: Link Here
587
  movq mm0,[src0]               ; R4    R0  r4  r0
587
  movq mm0,[src0]               ; R4    R0  r4  r0
588
  movq mm2,[src1]               ; R3    R1  r3  r1
588
  movq mm2,[src1]               ; R3    R1  r3  r1
589
  movq mm3,[src5]               ; R7    R5  r7  r5
589
  movq mm3,[src5]               ; R7    R5  r7  r5
590
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
590
  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
591
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
591
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
592
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
592
  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
593
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
593
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
594
  ; rounder_op mm4, rounder_arg
594
  ; rounder_op mm4, rounder_arg
595
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
595
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
596
  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
596
  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
597
  ; rounder_op mm0, rounder_arg
597
  ; rounder_op mm0, rounder_arg
598
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
598
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
599
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
599
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
600
  movq mm1,[coeffs+56]          ; C7    C5  C7  C5
600
  movq mm1,[ebx + coeffs+56 wrt ..gotoff]          ; C7    C5  C7  C5
601
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
601
  pmaddwd mm1,mm3               ; C7R7+C5R5 C7r7+C5r5
602
  pmaddwd mm2,[coeffs+64]       ; -C7R3+C3R1    -C7r3+C3r1
602
  pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff]       ; -C7R3+C3R1    -C7r3+C3r1
603
  paddd mm7,mm1                 ; B0        b0
603
  paddd mm7,mm1                 ; B0        b0
604
  movq mm1,[coeffs+72]          ; -C5   -C1 -C5 -C1
604
  movq mm1,[ebx + coeffs+72 wrt ..gotoff]          ; -C5   -C1 -C5 -C1
605
  pmaddwd mm1,mm3               ; -C5R7-C1R5    -C5r7-C1r5
605
  pmaddwd mm1,mm3               ; -C5R7-C1R5    -C5r7-C1r5
606
  paddd mm7,mm4                 ; A0+B0     a0+b0
606
  paddd mm7,mm4                 ; A0+B0     a0+b0
607
  paddd mm4,mm4                 ; 2A0       2a0
607
  paddd mm4,mm4                 ; 2A0       2a0
Lines 623-635 coeffs: Link Here
623
  packssdw mm4,mm4              ; A0-B0 a0-b0
623
  packssdw mm4,mm4              ; A0-B0 a0-b0
624
  movd [dst + 112],mm4
624
  movd [dst + 112],mm4
625
  movq mm0,[src1]               ; R3    R1  r3  r1
625
  movq mm0,[src1]               ; R3    R1  r3  r1
626
  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
626
  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ; -C1   C5  -C1     C5
627
  pmaddwd mm4,mm0               ; -C1R3+C5R1    -C1r3+C5r1
627
  pmaddwd mm4,mm0               ; -C1R3+C5R1    -C1r3+C5r1
628
  movq mm7,[coeffs+88]          ; C3    C7  C3  C7
628
  movq mm7,[ebx + coeffs+88 wrt ..gotoff]          ; C3    C7  C3  C7
629
  pmaddwd mm0,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
629
  pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
630
  pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
630
  pmaddwd mm7,mm3               ; C3R7+C7R5 C3r7+C7r5
631
  movq mm2,mm5                  ; A2        a2
631
  movq mm2,mm5                  ; A2        a2
632
  pmaddwd mm3,[coeffs+104]      ; -C1R7+C3R5    -C1r7+C3r5
632
  pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff]      ; -C1R7+C3R5    -C1r7+C3r5
633
  paddd mm4,mm7                 ; B2        b2
633
  paddd mm4,mm7                 ; B2        b2
634
  paddd mm2,mm4                 ; A2+B2     a2+b2
634
  paddd mm2,mm4                 ; A2+B2     a2+b2
635
  psubd mm5,mm4                 ; a2-B2     a2-b2
635
  psubd mm5,mm4                 ; a2-B2     a2-b2
Lines 674-690 coeffs: Link Here
674
%define shift       %8
674
%define shift       %8
675
  movq mm0,[src0]               ; R4    R0  r4  r0
675
  movq mm0,[src0]               ; R4    R0  r4  r0
676
  movq mm2,[src1]               ; R3    R1  r3  r1
676
  movq mm2,[src1]               ; R3    R1  r3  r1
677
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
677
  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
678
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
678
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
679
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
679
  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
680
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
680
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
681
  ; rounder_op mm4, rounder_arg
681
  ; rounder_op mm4, rounder_arg
682
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
682
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
683
  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
683
  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
684
  ; rounder_op mm0, rounder_arg
684
  ; rounder_op mm0, rounder_arg
685
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
685
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
686
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
686
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
687
  movq mm3,[coeffs+64]
687
  movq mm3,[ebx + coeffs+64 wrt ..gotoff]
688
  pmaddwd mm3,mm2               ; -C7R3+C3R1    -C7r3+C3r1
688
  pmaddwd mm3,mm2               ; -C7R3+C3R1    -C7r3+C3r1
689
  paddd mm7,mm4                 ; A0+B0     a0+b0
689
  paddd mm7,mm4                 ; A0+B0     a0+b0
690
  paddd mm4,mm4                 ; 2A0       2a0
690
  paddd mm4,mm4                 ; 2A0       2a0
Lines 704-712 coeffs: Link Here
704
  movd [dst + 96],mm1
704
  movd [dst + 96],mm1
705
  packssdw mm4,mm4              ; A0-B0 a0-b0
705
  packssdw mm4,mm4              ; A0-B0 a0-b0
706
  movd [dst + 112],mm4
706
  movd [dst + 112],mm4
707
  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
707
  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ; -C1   C5  -C1     C5
708
  pmaddwd mm4,mm2               ; -C1R3+C5R1    -C1r3+C5r1
708
  pmaddwd mm4,mm2               ; -C1R3+C5R1    -C1r3+C5r1
709
  pmaddwd mm2,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
709
  pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
710
  movq mm1,mm5                  ; A2        a2
710
  movq mm1,mm5                  ; A2        a2
711
  paddd mm1,mm4                 ; A2+B2     a2+b2
711
  paddd mm1,mm4                 ; A2+B2     a2+b2
712
  psubd mm5,mm4                 ; a2-B2     a2-b2
712
  psubd mm5,mm4                 ; a2-B2     a2-b2
Lines 750-762 coeffs: Link Here
750
%define	shift		%8
750
%define	shift		%8
751
  movq mm0,[src0]               ; R4    R0  r4  r0
751
  movq mm0,[src0]               ; R4    R0  r4  r0
752
  movq mm1,[src4]               ; R6    R2  r6  r2
752
  movq mm1,[src4]               ; R6    R2  r6  r2
753
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
753
  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
754
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
754
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
755
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
755
  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
756
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
756
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
757
  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
757
  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
758
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
758
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
759
  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
759
  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
760
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
760
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
761
  ; rounder_op mm4, rounder_arg
761
  ; rounder_op mm4, rounder_arg
762
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
762
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
Lines 768-780 coeffs: Link Here
768
  psubd mm5,mm1                 ; A2        a2
768
  psubd mm5,mm1                 ; A2        a2
769
  movq mm2,[src0 + 8]           ; R4    R0  r4  r0
769
  movq mm2,[src0 + 8]           ; R4    R0  r4  r0
770
  movq mm3,[src4 + 8]           ; R6    R2  r6  r2
770
  movq mm3,[src4 + 8]           ; R6    R2  r6  r2
771
  movq mm1,[coeffs+16]          ; C4    C4  C4  C4
771
  movq mm1,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
772
  pmaddwd mm1,mm2               ; C4R4+C4R0 C4r4+C4r0
772
  pmaddwd mm1,mm2               ; C4R4+C4R0 C4r4+C4r0
773
  movq mm7,[coeffs+24]          ; -C4   C4  -C4 C4
773
  movq mm7,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
774
  pmaddwd mm2,mm7               ; -C4R4+C4R0    -C4r4+C4r0
774
  pmaddwd mm2,mm7               ; -C4R4+C4R0    -C4r4+C4r0
775
  movq mm7,[coeffs+32]          ; C6    C2  C6  C2
775
  movq mm7,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
776
  pmaddwd mm7,mm3               ; C6R6+C2R2 C6r6+C2r2
776
  pmaddwd mm7,mm3               ; C6R6+C2R2 C6r6+C2r2
777
  pmaddwd mm3,[coeffs+40]       ; -C2R6+C6R2    -C2r6+C6r2
777
  pmaddwd mm3,[ebx + coeffs+40 wrt ..gotoff]       ; -C2R6+C6R2    -C2r6+C6r2
778
  ; rounder_op mm1, rounder_arg
778
  ; rounder_op mm1, rounder_arg
779
  paddd mm7,mm1                 ; A0        a0
779
  paddd mm7,mm1                 ; A0        a0
780
  paddd mm1,mm1                 ; 2C0       2c0
780
  paddd mm1,mm1                 ; 2C0       2c0
Lines 829-845 coeffs: Link Here
829
  movq mm0,[src0]               ; R4    R0  r4  r0
829
  movq mm0,[src0]               ; R4    R0  r4  r0
830
  movq mm1,[src4]               ; R6    R2  r6  r2
830
  movq mm1,[src4]               ; R6    R2  r6  r2
831
  movq mm2,[src1]               ; R3    R1  r3  r1
831
  movq mm2,[src1]               ; R3    R1  r3  r1
832
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
832
  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
833
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
833
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
834
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
834
  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
835
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
835
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
836
  movq mm5,[coeffs+32]          ; C6    C2  C6  C2
836
  movq mm5,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
837
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
837
  pmaddwd mm5,mm1               ; C6R6+C2R2 C6r6+C2r2
838
  movq mm6,[coeffs+40]          ; -C2   C6  -C2 C6
838
  movq mm6,[ebx + coeffs+40 wrt ..gotoff]          ; -C2   C6  -C2 C6
839
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
839
  pmaddwd mm1,mm6               ; -C2R6+C6R2    -C2r6+C6r2
840
  ; rounder_op mm4, rounder_arg
840
  ; rounder_op mm4, rounder_arg
841
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
841
  movq mm6,mm4                  ; C4R4+C4R0 C4r4+C4r0
842
  movq mm7,[coeffs+48]          ; C3    C1  C3  C1
842
  movq mm7,[ebx + coeffs+48 wrt ..gotoff]          ; C3    C1  C3  C1
843
  ; rounder_op mm0, rounder_arg
843
  ; rounder_op mm0, rounder_arg
844
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
844
  pmaddwd mm7,mm2               ; C3R3+C1R1 C3r3+C1r1
845
  paddd mm4,mm5                 ; A0        a0
845
  paddd mm4,mm5                 ; A0        a0
Lines 847-853 coeffs: Link Here
847
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
847
  movq mm5,mm0                  ; -C4R4+C4R0    -C4r4+C4r0
848
  paddd mm0,mm1                 ; A1        a1
848
  paddd mm0,mm1                 ; A1        a1
849
  psubd mm5,mm1                 ; A2        a2
849
  psubd mm5,mm1                 ; A2        a2
850
  movq mm1,[coeffs+64]
850
  movq mm1,[ebx + coeffs+64 wrt ..gotoff]
851
  pmaddwd mm1,mm2               ; -C7R3+C3R1    -C7r3+C3r1
851
  pmaddwd mm1,mm2               ; -C7R3+C3R1    -C7r3+C3r1
852
  paddd mm7,mm4                 ; A0+B0     a0+b0
852
  paddd mm7,mm4                 ; A0+B0     a0+b0
853
  paddd mm4,mm4                 ; 2A0       2a0
853
  paddd mm4,mm4                 ; 2A0       2a0
Lines 867-875 coeffs: Link Here
867
  movd [dst + 96],mm3
867
  movd [dst + 96],mm3
868
  packssdw mm4,mm4              ; A0-B0 a0-b0
868
  packssdw mm4,mm4              ; A0-B0 a0-b0
869
  movd [dst + 112],mm4
869
  movd [dst + 112],mm4
870
  movq mm4,[coeffs+80]          ; -C1   C5  -C1     C5
870
  movq mm4,[ebx + coeffs+80 wrt ..gotoff]          ; -C1   C5  -C1     C5
871
  pmaddwd mm4,mm2               ; -C1R3+C5R1    -C1r3+C5r1
871
  pmaddwd mm4,mm2               ; -C1R3+C5R1    -C1r3+C5r1
872
  pmaddwd mm2,[coeffs+96]       ; -C5R3+C7R1    -C5r3+C7r1
872
  pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff]       ; -C5R3+C7R1    -C5r3+C7r1
873
  movq mm3,mm5                  ; A2        a2
873
  movq mm3,mm5                  ; A2        a2
874
  paddd mm3,mm4                 ; A2+B2     a2+b2
874
  paddd mm3,mm4                 ; A2+B2     a2+b2
875
  psubd mm5,mm4                 ; a2-B2     a2-b2
875
  psubd mm5,mm4                 ; a2-B2     a2-b2
Lines 912-931 coeffs: Link Here
912
%define	rounder_arg	%7
912
%define	rounder_arg	%7
913
%define	shift		%8
913
%define	shift		%8
914
  movq mm0,[src0]               ; R4    R0  r4  r0
914
  movq mm0,[src0]               ; R4    R0  r4  r0
915
  movq mm4,[coeffs+16]          ; C4    C4  C4  C4
915
  movq mm4,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
916
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
916
  pmaddwd mm4,mm0               ; C4R4+C4R0 C4r4+C4r0
917
  movq mm5,[coeffs+24]          ; -C4   C4  -C4 C4
917
  movq mm5,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
918
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
918
  pmaddwd mm0,mm5               ; -C4R4+C4R0    -C4r4+C4r0
919
  ; rounder_op mm4, rounder_arg
919
  ; rounder_op mm4, rounder_arg
920
  ; rounder_op mm0, rounder_arg
920
  ; rounder_op mm0, rounder_arg
921
  psrad mm4,shift
921
  psrad mm4,shift
922
  psrad mm0,shift
922
  psrad mm0,shift
923
  movq mm2,[src0 + 8]           ; R4    R0  r4  r0
923
  movq mm2,[src0 + 8]           ; R4    R0  r4  r0
924
  movq mm1,[coeffs+16]          ; C4    C4  C4  C4
924
  movq mm1,[ebx + coeffs+16 wrt ..gotoff]          ; C4    C4  C4  C4
925
  pmaddwd mm1,mm2               ; C4R4+C4R0 C4r4+C4r0
925
  pmaddwd mm1,mm2               ; C4R4+C4R0 C4r4+C4r0
926
  movq mm7,[coeffs+24]          ; -C4   C4  -C4 C4
926
  movq mm7,[ebx + coeffs+24 wrt ..gotoff]          ; -C4   C4  -C4 C4
927
  pmaddwd mm2,mm7               ; -C4R4+C4R0    -C4r4+C4r0
927
  pmaddwd mm2,mm7               ; -C4R4+C4R0    -C4r4+C4r0
928
  movq mm7,[coeffs+32]          ; C6    C2  C6  C2
928
  movq mm7,[ebx + coeffs+32 wrt ..gotoff]          ; C6    C2  C6  C2
929
  ; rounder_op mm1, rounder_arg
929
  ; rounder_op mm1, rounder_arg
930
  ; rounder_op mm2, rounder_arg
930
  ; rounder_op mm2, rounder_arg
931
  psrad mm1,shift
931
  psrad mm1,shift
Lines 1073-1078 coeffs: Link Here
1073
1073
1074
SECTION .text
1074
SECTION .text
1075
1075
1076
extern  _GLOBAL_OFFSET_TABLE_
1077
get_pc.bx:
1078
  mov ebx, [esp]
1079
  retn
1080
1076
cglobal simple_idct_mmx_P
1081
cglobal simple_idct_mmx_P
1077
cglobal simple_idct_mmx
1082
cglobal simple_idct_mmx
1078
1083
Lines 1083-1096 cglobal simple_idct_mmx Link Here
1083
1088
1084
ALIGN 16
1089
ALIGN 16
1085
simple_idct_mmx_P:
1090
simple_idct_mmx_P:
1091
  push ebx
1092
  call get_pc.bx
1093
  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
1094
1086
  sub esp, 128
1095
  sub esp, 128
1087
  mov edx, [esp+128+4]
1096
  mov edx, [esp+128+4+4]
1088
1097
1089
;               src0,   src4,   src1,   src5,   dst,    rndop,  rndarg,     shift,  bt
1098
;               src0,   src4,   src1,   src5,   dst,    rndop,  rndarg,     shift,  bt
1090
  DC_COND_IDCT  edx+0,  edx+8,  edx+16, edx+24, esp,    paddd,  [coeffs+8], 11
1099
  DC_COND_IDCT  edx+0,  edx+8,  edx+16, edx+24, esp,    paddd,  [ebx + coeffs+8 wrt ..gotoff], 11
1091
  Z_COND_IDCT   edx+32, edx+40, edx+48, edx+56, esp+32, paddd,  [coeffs],   11,     .four
1100
  Z_COND_IDCT   edx+32, edx+40, edx+48, edx+56, esp+32, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .four
1092
  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [coeffs],   11,     .two
1101
  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .two
1093
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .one
1102
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .one
1094
  IDCT0         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1103
  IDCT0         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1095
  IDCT0         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1104
  IDCT0         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1096
  IDCT0         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
1105
  IDCT0         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
Lines 1099-1106 simple_idct_mmx_P: Link Here
1099
1108
1100
ALIGN 16
1109
ALIGN 16
1101
.four
1110
.four
1102
  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [coeffs],   11,     .six
1111
  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .six
1103
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .five
1112
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .five
1104
  IDCT4         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1113
  IDCT4         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1105
  IDCT4         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1114
  IDCT4         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1106
  IDCT4         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
1115
  IDCT4         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
Lines 1109-1115 ALIGN 16 Link Here
1109
1118
1110
ALIGN 16
1119
ALIGN 16
1111
.six
1120
.six
1112
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .seven
1121
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .seven
1113
  IDCT6         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1122
  IDCT6         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1114
  IDCT6         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1123
  IDCT6         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1115
  IDCT6         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
1124
  IDCT6         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
Lines 1118-1124 ALIGN 16 Link Here
1118
1127
1119
ALIGN 16
1128
ALIGN 16
1120
.two
1129
.two
1121
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .three
1130
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .three
1122
  IDCT2         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1131
  IDCT2         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1123
  IDCT2         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1132
  IDCT2         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1124
  IDCT2         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
1133
  IDCT2         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
Lines 1159-1164 ALIGN 16 Link Here
1159
.ret
1168
.ret
1160
  add esp, 128
1169
  add esp, 128
1161
1170
1171
  pop ebx
1162
  ret
1172
  ret
1163
.endfunc
1173
.endfunc
1164
1174
Lines 1174-1188 ALIGN 16 Link Here
1174
1184
1175
ALIGN 16
1185
ALIGN 16
1176
simple_idct_mmx:
1186
simple_idct_mmx:
1187
  push ebx
1188
  call get_pc.bx
1189
  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
1190
1177
  sub esp, 128
1191
  sub esp, 128
1178
  mov edx, [esp+128+4]
1192
  mov edx, [esp+128+4+4]
1179
  PERMUTEP edx			; permute parm list in place
1193
  PERMUTEP edx			; permute parm list in place
1180
1194
1181
;               src0,   src4,   src1,   src5,   dst,    rndop,  rndarg,     shift,  bt
1195
;               src0,   src4,   src1,   src5,   dst,    rndop,  rndarg,     shift,  bt
1182
  DC_COND_IDCT  edx+0,  edx+8,  edx+16, edx+24, esp,    paddd,  [coeffs+8], 11
1196
  DC_COND_IDCT  edx+0,  edx+8,  edx+16, edx+24, esp,    paddd,  [ebx + coeffs+8 wrt ..gotoff], 11
1183
  Z_COND_IDCT   edx+32, edx+40, edx+48, edx+56, esp+32, paddd,  [coeffs],   11,     .fourP
1197
  Z_COND_IDCT   edx+32, edx+40, edx+48, edx+56, esp+32, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .fourP
1184
  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [coeffs],   11,     .twoP
1198
  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .twoP
1185
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .oneP
1199
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .oneP
1186
  IDCT0         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1200
  IDCT0         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1187
  IDCT0         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1201
  IDCT0         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1188
  IDCT0         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
1202
  IDCT0         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
Lines 1191-1198 simple_idct_mmx: Link Here
1191
1205
1192
ALIGN 16
1206
ALIGN 16
1193
.fourP
1207
.fourP
1194
  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [coeffs],   11,     .sixP
1208
  Z_COND_IDCT   edx+64, edx+72, edx+80, edx+88, esp+64, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .sixP
1195
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .fiveP
1209
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .fiveP
1196
  IDCT4         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1210
  IDCT4         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1197
  IDCT4         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1211
  IDCT4         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1198
  IDCT4         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
1212
  IDCT4         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
Lines 1201-1207 ALIGN 16 Link Here
1201
1215
1202
ALIGN 16
1216
ALIGN 16
1203
.sixP
1217
.sixP
1204
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .sevenP
1218
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .sevenP
1205
  IDCT6         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1219
  IDCT6         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1206
  IDCT6         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1220
  IDCT6         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1207
  IDCT6         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
1221
  IDCT6         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
Lines 1210-1216 ALIGN 16 Link Here
1210
1224
1211
ALIGN 16
1225
ALIGN 16
1212
.twoP
1226
.twoP
1213
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [coeffs],   11,     .threeP
1227
  Z_COND_IDCT   edx+96, edx+104,edx+112,edx+120,esp+96, paddd,  [ebx + coeffs wrt ..gotoff],   11,     .threeP
1214
  IDCT2         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1228
  IDCT2         esp,    esp+64, esp+32, esp+96, edx,    nop,    0,          20
1215
  IDCT2         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1229
  IDCT2         esp+8,  esp+72, esp+40, esp+104,edx+4,  nop,    0,          20
1216
  IDCT2         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
1230
  IDCT2         esp+16, esp+80, esp+48, esp+112,edx+8,  nop,    0,          20
Lines 1251-1256 ALIGN 16 Link Here
1251
.retP
1265
.retP
1252
  add esp, 128
1266
  add esp, 128
1253
1267
1268
  pop ebx
1254
  ret
1269
  ret
1255
.endfunc
1270
.endfunc
1256
1271
(-)xvidcore-1.1.3-old/src/image/x86_asm/colorspace_mmx.inc (-10 / +9 lines)
Lines 56-66 NAME: Link Here
56
  push edi    ;   esp + localsize + 4
56
  push edi    ;   esp + localsize + 4
57
  push ebp    ;   esp + localsize + 0
57
  push ebp    ;   esp + localsize + 0
58
58
59
  call get_pc.bp
60
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
61
59
%define x_dif           esp + localsize - 4
62
%define x_dif           esp + localsize - 4
60
%define y_dif           esp + localsize - 8
63
%define y_dif           esp + localsize - 8
61
%define uv_dif          esp + localsize - 12
64
%define uv_dif          esp + localsize - 12
62
%define fixed_width     esp + localsize - 16
65
%define fixed_width     esp + localsize - 16
63
%define tmp_height      esp + localsize - 20
66
%define tmp_fixed_width esp + localsize - 20
64
67
65
    sub esp, localsize
68
    sub esp, localsize
66
69
Lines 90-97 NAME: Link Here
90
  mov esi, [y_ptr]          ; $esi$ = y_ptr
93
  mov esi, [y_ptr]          ; $esi$ = y_ptr
91
  mov edi, [x_ptr]          ; $edi$ = x_ptr
94
  mov edi, [x_ptr]          ; $edi$ = x_ptr
92
  mov edx, [x_stride]       ; $edx$ = x_stride
95
  mov edx, [x_stride]       ; $edx$ = x_stride
93
  mov ebp, [height]         ; $ebp$ = height
94
95
96
96
  mov ebx, [vflip]
97
  mov ebx, [vflip]
97
  or ebx, ebx
98
  or ebx, ebx
Lines 106-112 NAME: Link Here
106
  sub ebx, edx
107
  sub ebx, edx
107
  mov [x_dif], ebx          ; x_dif = -BYTES*fixed_width - x_stride
108
  mov [x_dif], ebx          ; x_dif = -BYTES*fixed_width - x_stride
108
109
109
  mov eax, ebp
110
  mov eax, [height]
110
  sub eax, 1
111
  sub eax, 1
111
  push edx                
112
  push edx                
112
  mul edx
113
  mul edx
Lines 126-133 NAME: Link Here
126
  FUNC %+ _INIT ARG1, ARG2  ; call FUNC_INIT
127
  FUNC %+ _INIT ARG1, ARG2  ; call FUNC_INIT
127
128
128
.y_loop
129
.y_loop
129
  mov [tmp_height], ebp
130
  push dword [fixed_width]
130
  mov ebp, [fixed_width]
131
  pop dword [tmp_fixed_width]
131
132
132
.x_loop
133
.x_loop
133
  FUNC ARG1, ARG2           ; call FUNC
134
  FUNC ARG1, ARG2           ; call FUNC
Lines 137-146 NAME: Link Here
137
  add ebx, PIXELS/2         ; u_ptr += PIXELS/2
138
  add ebx, PIXELS/2         ; u_ptr += PIXELS/2
138
  add ecx, PIXELS/2         ; v_ptr += PIXELS/2
139
  add ecx, PIXELS/2         ; v_ptr += PIXELS/2
139
        
140
        
140
  sub ebp, PIXELS           ; $ebp$ -= PIXELS
141
  sub dword [tmp_fixed_width], PIXELS           ; tmp_fixed_width -= PIXELS (counter lives in memory; ebp now holds the GOT base)
141
  jg .x_loop                ; if ($ebp$ > 0) goto .x_loop
142
  jg .x_loop                ; if (tmp_fixed_width > 0) goto .x_loop
142
143
143
  mov ebp, [tmp_height]
144
  add edi, [x_dif]          ; x_ptr += x_dif + (VPIXELS-1)*x_stride
144
  add edi, [x_dif]          ; x_ptr += x_dif + (VPIXELS-1)*x_stride
145
  add esi, [y_dif]          ; y_ptr += y_dif + (VPIXELS-1)*y_stride
145
  add esi, [y_dif]          ; y_ptr += y_dif + (VPIXELS-1)*y_stride
146
%rep VPIXELS-1
146
%rep VPIXELS-1
Lines 155-161 NAME: Link Here
155
  add ecx, [uv_stride]
155
  add ecx, [uv_stride]
156
%endrep
156
%endrep
157
157
158
  sub ebp, VPIXELS          ; $ebp$ -= VPIXELS
158
  sub dword [height], VPIXELS          ; height -= VPIXELS (updated in place in the [height] slot)
159
  jg .y_loop                ; if ($ebp$ > 0) goto .y_loop
159
  jg .y_loop                ; if (height > 0) goto .y_loop
160
160
161
  ; cleanup stack & undef everything
161
  ; cleanup stack & undef everything
Lines 181-187 NAME: Link Here
181
%undef y_dif
181
%undef y_dif
182
%undef uv_dif
182
%undef uv_dif
183
%undef fixed_width
183
%undef fixed_width
%undef tmp_fixed_width      ; defined alongside fixed_width above; release it with the other stack-local names
184
%undef tmp_height
185
        ret
184
        ret
186
.endfunc
185
.endfunc
187
%undef NAME
186
%undef NAME
(-)xvidcore-1.1.3-old/src/image/x86_asm/colorspace_rgb_mmx.asm (-17 / +22 lines)
Lines 120-126 BRIGHT: db 128, 128, 128, 128, 128, 128, Link Here
120
;------------------------------------------------------------------------------
120
;------------------------------------------------------------------------------
121
121
122
%macro BGR_TO_YV12_INIT		2
122
%macro BGR_TO_YV12_INIT		2
123
  movq mm7, [y_mul]
123
  movq mm7, [ebp + y_mul wrt ..gotoff]
124
%endmacro
124
%endmacro
125
125
126
126
Lines 184-191 BRIGHT: db 128, 128, 128, 128, 128, 128, Link Here
184
184
185
  ; u_ptr, v_ptr
185
  ; u_ptr, v_ptr
186
  movq mm0, mm6                 ; = [  |b4|g4|r4]
186
  movq mm0, mm6                 ; = [  |b4|g4|r4]
187
  pmaddwd mm6, [v_mul]          ; *= V_MUL
187
  pmaddwd mm6, [ebp + v_mul wrt ..gotoff]          ; *= V_MUL
188
  pmaddwd mm0, [u_mul]          ; *= U_MUL
188
  pmaddwd mm0, [ebp + u_mul wrt ..gotoff]          ; *= U_MUL
189
  movq mm1, mm0
189
  movq mm1, mm0
190
  movq mm2, mm6
190
  movq mm2, mm6
191
  psrlq mm1, 32
191
  psrlq mm1, 32
Lines 230-259 BRIGHT: db 128, 128, 128, 128, 128, 128, Link Here
230
  movd mm3, [ecx]           ; v_ptr[0]
230
  movd mm3, [ecx]           ; v_ptr[0]
231
  punpcklbw mm2, mm7        ; u3u2u1u0 -> mm2
231
  punpcklbw mm2, mm7        ; u3u2u1u0 -> mm2
232
  punpcklbw mm3, mm7        ; v3v2v1v0 -> mm3
232
  punpcklbw mm3, mm7        ; v3v2v1v0 -> mm3
233
  psubsw mm2, [U_SUB]       ; U - 128
233
  psubsw mm2, [ebp + U_SUB wrt ..gotoff]       ; U - 128
234
  psubsw mm3, [V_SUB]       ; V - 128
234
  psubsw mm3, [ebp + V_SUB wrt ..gotoff]       ; V - 128
235
  movq mm4, mm2
235
  movq mm4, mm2
236
  movq mm5, mm3
236
  movq mm5, mm3
237
  pmullw mm2, [UG_MUL]
237
  pmullw mm2, [ebp + UG_MUL wrt ..gotoff]
238
  pmullw mm3, [VG_MUL]
238
  pmullw mm3, [ebp + VG_MUL wrt ..gotoff]
239
  movq mm6, mm2             ; u3u2u1u0 -> mm6
239
  movq mm6, mm2             ; u3u2u1u0 -> mm6
240
  punpckhwd mm2, mm2        ; u3u3u2u2 -> mm2
240
  punpckhwd mm2, mm2        ; u3u3u2u2 -> mm2
241
  punpcklwd mm6, mm6        ; u1u1u0u0 -> mm6
241
  punpcklwd mm6, mm6        ; u1u1u0u0 -> mm6
242
  pmullw mm4, [UB_MUL]      ; B_ADD -> mm4
242
  pmullw mm4, [ebp + UB_MUL wrt ..gotoff]      ; B_ADD -> mm4
243
  movq mm0, mm3
243
  movq mm0, mm3
244
  punpckhwd mm3, mm3        ; v3v3v2v2 -> mm2
244
  punpckhwd mm3, mm3        ; v3v3v2v2 -> mm2
245
  punpcklwd mm0, mm0        ; v1v1v0v0 -> mm6
245
  punpcklwd mm0, mm0        ; v1v1v0v0 -> mm6
246
  paddsw mm2, mm3
246
  paddsw mm2, mm3
247
  paddsw mm6, mm0
247
  paddsw mm6, mm0
248
  pmullw mm5, [VR_MUL]      ; R_ADD -> mm5
248
  pmullw mm5, [ebp + VR_MUL wrt ..gotoff]      ; R_ADD -> mm5
249
  movq mm0, [esi]           ; y7y6y5y4y3y2y1y0 -> mm0
249
  movq mm0, [esi]           ; y7y6y5y4y3y2y1y0 -> mm0
250
  movq mm1, mm0
250
  movq mm1, mm0
251
  punpckhbw mm1, mm7        ; y7y6y5y4 -> mm1
251
  punpckhbw mm1, mm7        ; y7y6y5y4 -> mm1
252
  punpcklbw mm0, mm7        ; y3y2y1y0 -> mm0
252
  punpcklbw mm0, mm7        ; y3y2y1y0 -> mm0
253
  psubsw mm0, [Y_SUB]       ; Y - Y_SUB
253
  psubsw mm0, [ebp + Y_SUB wrt ..gotoff]       ; Y - Y_SUB
254
  psubsw mm1, [Y_SUB]       ; Y - Y_SUB
254
  psubsw mm1, [ebp + Y_SUB wrt ..gotoff]       ; Y - Y_SUB
255
  pmullw mm1, [Y_MUL]
255
  pmullw mm1, [ebp + Y_MUL wrt ..gotoff]
256
  pmullw mm0, [Y_MUL]
256
  pmullw mm0, [ebp + Y_MUL wrt ..gotoff]
257
  movq [TEMP_Y2], mm1       ; y7y6y5y4 -> mm3
257
  movq [TEMP_Y2], mm1       ; y7y6y5y4 -> mm3
258
  movq [TEMP_Y1], mm0       ; y3y2y1y0 -> mm7
258
  movq [TEMP_Y1], mm0       ; y3y2y1y0 -> mm7
259
  psubsw mm1, mm2           ; g7g6g5g4 -> mm1
259
  psubsw mm1, mm2           ; g7g6g5g4 -> mm1
Lines 266-275 BRIGHT: db 128, 128, 128, 128, 128, 128, Link Here
266
  movq mm1, mm0
266
  movq mm1, mm0
267
  punpckhbw mm1, mm7        ; y7y6y5y4 -> mm1
267
  punpckhbw mm1, mm7        ; y7y6y5y4 -> mm1
268
  punpcklbw mm0, mm7        ; y3y2y1y0 -> mm0
268
  punpcklbw mm0, mm7        ; y3y2y1y0 -> mm0
269
  psubsw mm0, [Y_SUB]       ; Y - Y_SUB
269
  psubsw mm0, [ebp + Y_SUB wrt ..gotoff]       ; Y - Y_SUB
270
  psubsw mm1, [Y_SUB]       ; Y - Y_SUB
270
  psubsw mm1, [ebp + Y_SUB wrt ..gotoff]       ; Y - Y_SUB
271
  pmullw mm1, [Y_MUL]
271
  pmullw mm1, [ebp + Y_MUL wrt ..gotoff]
272
  pmullw mm0, [Y_MUL]
272
  pmullw mm0, [ebp + Y_MUL wrt ..gotoff]
273
  movq mm3, mm1
273
  movq mm3, mm1
274
  psubsw mm1, mm2           ; g7g6g5g4 -> mm1
274
  psubsw mm1, mm2           ; g7g6g5g4 -> mm1
275
  movq mm2, mm0
275
  movq mm2, mm0
Lines 419-424 BRIGHT: db 128, 128, 128, 128, 128, 128, Link Here
419
419
420
SECTION .text
420
SECTION .text
421
421
422
extern  _GLOBAL_OFFSET_TABLE_
423
get_pc.bp:
424
  mov ebp, [esp]
425
  retn
426
422
%include "colorspace_mmx.inc"
427
%include "colorspace_mmx.inc"
423
428
424
; input
429
; input
(-)xvidcore-1.1.3-old/src/image/x86_asm/colorspace_yuyv_mmx.asm (-3 / +8 lines)
Lines 76-82 mmx_one: dw 1, 1, 1, 1 Link Here
76
;-----------------------------------------------------------------------------
76
;-----------------------------------------------------------------------------
77
77
78
%macro YUYV_TO_YV12_INIT		2
78
%macro YUYV_TO_YV12_INIT		2
79
  movq mm7, [yuyv_mask]
79
  movq mm7, [ebp + yuyv_mask wrt ..gotoff]
80
%endmacro
80
%endmacro
81
81
82
82
Lines 108-115 mmx_one: dw 1, 1, 1, 1 Link Here
108
  pand mm5, mm7
108
  pand mm5, mm7
109
  pand mm6, mm7
109
  pand mm6, mm7
110
  paddw mm5, mm6
110
  paddw mm5, mm6
111
  paddw mm4, [mmx_one]      ; +1 rounding
111
  paddw mm4, [ebp + mmx_one wrt ..gotoff]      ; +1 rounding
112
  paddw mm5, [mmx_one]      ;
112
  paddw mm5, [ebp + mmx_one wrt ..gotoff]      ;
113
  psrlw mm4, 1
113
  psrlw mm4, 1
114
  psrlw mm5, 1
114
  psrlw mm5, 1
115
;---[ 3dnow/xmm ]----------------------------------------------------
115
;---[ 3dnow/xmm ]----------------------------------------------------
Lines 310-315 mmx_one: dw 1, 1, 1, 1 Link Here
310
310
311
SECTION .text
311
SECTION .text
312
312
313
extern  _GLOBAL_OFFSET_TABLE_
314
get_pc.bp:
315
  mov ebp, [esp]
316
  retn
317
313
%include "colorspace_mmx.inc"
318
%include "colorspace_mmx.inc"
314
319
315
; input
320
; input
(-)xvidcore-1.1.3-old/src/image/x86_asm/interpolate8x8_3dn.asm (-20 / +24 lines)
Lines 44-63 BITS 32 Link Here
44
%endmacro
44
%endmacro
45
45
46
;=============================================================================
46
;=============================================================================
47
; Read Only data
48
;=============================================================================
49
50
%ifdef FORMAT_COFF
51
SECTION .rodata
52
%else
53
SECTION .rodata align=16
54
%endif
55
56
ALIGN 16
57
mmx_one:
58
	times 8 db 1
59
60
;=============================================================================
61
; Code
47
; Code
62
;=============================================================================
48
;=============================================================================
63
49
Lines 132-138 interpolate8x8_halfpel_h_3dn: Link Here
132
118
133
.rounding1
119
.rounding1
134
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
120
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
135
  movq mm7, [mmx_one]
121
  push dword 0x01010101
122
  push dword 0x01010101
123
  movq mm7, [esp]
124
  add esp, byte 8
136
  COPY_H_3DN_RND1
125
  COPY_H_3DN_RND1
137
  lea ecx, [ecx+2*edx]
126
  lea ecx, [ecx+2*edx]
138
  COPY_H_3DN_RND1
127
  COPY_H_3DN_RND1
Lines 206-212 interpolate8x8_halfpel_v_3dn: Link Here
206
195
207
.rounding1
196
.rounding1
208
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
197
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
209
  movq mm7, [mmx_one]
198
  push dword 0x01010101
199
  push dword 0x01010101
200
  movq mm7, [esp]
201
  add esp, byte 8
210
  movq mm2, [eax]       ; loop invariant
202
  movq mm2, [eax]       ; loop invariant
211
  add eax, edx
203
  add eax, edx
212
204
Lines 329-335 interpolate8x8_halfpel_hv_3dn Link Here
329
  mov eax, [esp+ 8] ; Src
321
  mov eax, [esp+ 8] ; Src
330
  mov edx, [esp+12] ; stride
322
  mov edx, [esp+12] ; stride
331
323
332
  movq mm7, [mmx_one]
324
  push dword 0x01010101
325
  push dword 0x01010101
326
  movq mm7, [esp]
327
  lea esp, [esp + 8]    ; pop the temp qword without modifying EFLAGS (add would overwrite them)
                        ; NOTE(review): confirm whether a later branch in this function uses flags set before this point
333
328
334
    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
329
    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
335
  movq mm2, [eax]
330
  movq mm2, [eax]
Lines 387-393 interpolate8x4_halfpel_h_3dn: Link Here
387
382
388
.rounding1
383
.rounding1
389
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
384
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
390
  movq mm7, [mmx_one]
385
  push dword 0x01010101
386
  push dword 0x01010101
387
  movq mm7, [esp]
388
  add esp, byte 8
391
  COPY_H_3DN_RND1
389
  COPY_H_3DN_RND1
392
  lea ecx, [ecx+2*edx]
390
  lea ecx, [ecx+2*edx]
393
  COPY_H_3DN_RND1
391
  COPY_H_3DN_RND1
Lines 424-430 interpolate8x4_halfpel_v_3dn: Link Here
424
422
425
.rounding1
423
.rounding1
426
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
424
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
427
  movq mm7, [mmx_one]
425
  push dword 0x01010101
426
  push dword 0x01010101
427
  movq mm7, [esp]
428
  add esp, byte 8
428
  movq mm2, [eax]       ; loop invariant
429
  movq mm2, [eax]       ; loop invariant
429
  add eax, edx
430
  add eax, edx
430
431
Lines 462-468 interpolate8x4_halfpel_hv_3dn Link Here
462
  mov eax, [esp+ 8] ; Src
463
  mov eax, [esp+ 8] ; Src
463
  mov edx, [esp+12] ; stride
464
  mov edx, [esp+12] ; stride
464
465
465
  movq mm7, [mmx_one]
466
  push dword 0x01010101
467
  push dword 0x01010101
468
  movq mm7, [esp]
469
  lea esp, [esp + 8]    ; pop the temp qword without modifying EFLAGS (add would overwrite them)
                        ; NOTE(review): confirm whether a later branch in this function uses flags set before this point
466
470
467
    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
471
    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
468
  movq mm2, [eax]
472
  movq mm2, [eax]
(-)xvidcore-1.1.3-old/src/image/x86_asm/interpolate8x8_3dne.asm (-42 / +34 lines)
Lines 45-68 BITS 32 Link Here
45
%endmacro
45
%endmacro
46
46
47
;=============================================================================
47
;=============================================================================
48
; Read only data
49
;=============================================================================
50
51
%ifdef FORMAT_COFF
52
SECTION .rodata
53
%else
54
SECTION .rodata align=16
55
%endif
56
57
ALIGN 16
58
mmx_one:
59
	times 8 db 1
60
61
ALIGN 8
62
mm_minusone:
63
	dd -1,-1
64
65
;=============================================================================
66
; Macros
48
; Macros
67
;=============================================================================
49
;=============================================================================
68
50
Lines 149-155 interpolate8x8_halfpel_h_3dne: Link Here
149
.rounding1
131
.rounding1
150
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
132
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
151
  mov ecx, [esp+ 4] ; Dst
133
  mov ecx, [esp+ 4] ; Dst
152
  movq mm7, [mmx_one]
134
  push dword 0x01010101
135
  push dword 0x01010101
136
  movq mm7, [esp]
137
  add esp, byte 8
153
  COPY_H_SSE_RND1
138
  COPY_H_SSE_RND1
154
  lea ecx, [ecx+2*edx]
139
  lea ecx, [ecx+2*edx]
155
  COPY_H_SSE_RND1
140
  COPY_H_SSE_RND1
Lines 223-237 ALIGN 8 Link Here
223
  psubusb mm0, [eax]
208
  psubusb mm0, [eax]
224
  add eax, edx
209
  add eax, edx
225
  mov ecx, [esp+ 4] ; Dst
210
  mov ecx, [esp+ 4] ; Dst
226
  push esi
211
  push byte -1
212
  push byte -1
227
  pcmpeqb mm1, mm1
213
  pcmpeqb mm1, mm1
228
  pcmpeqb mm2, mm2
214
  pcmpeqb mm2, mm2
229
  mov esi, mm_minusone
230
  psubusb mm1, [byte eax]
215
  psubusb mm1, [byte eax]
231
  psubusb mm2, [eax+edx]
216
  psubusb mm2, [eax+edx]
232
  lea eax, [eax+2*edx]
217
  lea eax, [eax+2*edx]
233
  movq mm6, [esi]
218
  movq mm6, [esp]
234
  movq mm7, [esi]
219
  movq mm7, [esp]
235
  pavgb mm0, mm1
220
  pavgb mm0, mm1
236
  pavgb mm1, mm2
221
  pavgb mm1, mm2
237
  psubusb mm6, mm0
222
  psubusb mm6, mm0
Lines 246-253 ALIGN 8 Link Here
246
  lea eax, [eax+2*edx]
231
  lea eax, [eax+2*edx]
247
  pavgb mm2, mm3
232
  pavgb mm2, mm3
248
  pavgb mm3, mm4
233
  pavgb mm3, mm4
249
  movq mm0, [esi]
234
  movq mm0, [esp]
250
  movq mm1, [esi]
235
  movq mm1, [esp]
251
  psubusb mm0, mm2
236
  psubusb mm0, mm2
252
  psubusb mm1, mm3
237
  psubusb mm1, mm3
253
  movq [ecx], mm0
238
  movq [ecx], mm0
Lines 261-268 ALIGN 8 Link Here
261
  lea eax, [eax+2*edx]
246
  lea eax, [eax+2*edx]
262
  pavgb mm4, mm5
247
  pavgb mm4, mm5
263
  pavgb mm5, mm6
248
  pavgb mm5, mm6
264
  movq mm2, [esi]
249
  movq mm2, [esp]
265
  movq mm3, [esi]
250
  movq mm3, [esp]
266
  psubusb mm2, mm4
251
  psubusb mm2, mm4
267
  psubusb mm3, mm5
252
  psubusb mm3, mm5
268
  movq [ecx], mm2
253
  movq [ecx], mm2
Lines 274-283 ALIGN 8 Link Here
274
  psubusb mm0, [eax+edx]
259
  psubusb mm0, [eax+edx]
275
  pavgb mm6, mm7
260
  pavgb mm6, mm7
276
  pavgb mm7, mm0
261
  pavgb mm7, mm0
277
  movq mm4, [esi]
262
  movq mm4, [esp]
278
  movq mm5, [esi]
263
  movq mm5, [esp]
279
  psubusb mm4, mm6
264
  psubusb mm4, mm6
280
  pop esi
265
  add esp, byte 8
281
  psubusb mm5, mm7
266
  psubusb mm5, mm7
282
  movq [ecx], mm4
267
  movq [ecx], mm4
283
  movq [ecx+edx], mm5
268
  movq [ecx+edx], mm5
Lines 391-397 interpolate8x8_halfpel_hv_3dne: Link Here
391
  pavgb mm2, mm3
376
  pavgb mm2, mm3
392
  pxor mm3, mm6         ; mm2/mm3 ready
377
  pxor mm3, mm6         ; mm2/mm3 ready
393
  mov ecx, [esp+ 4]     ; Dst
378
  mov ecx, [esp+ 4]     ; Dst
394
  movq mm7, [mmx_one]
379
  push dword 0x01010101
380
  push dword 0x01010101
381
  movq mm7, [esp]
382
  lea esp, [esp + 8]    ; release the temp qword without touching EFLAGS: the jz below must see the flags
                        ; produced before this sequence, and `add` would replace them with its own result
395
383
396
  jz near .rounding1
384
  jz near .rounding1
397
  lea ebp,[byte ebp]
385
  lea ebp,[byte ebp]
Lines 443-449 interpolate8x4_halfpel_h_3dne: Link Here
443
.rounding1
431
.rounding1
444
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
432
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
445
  mov ecx, [esp+ 4] ; Dst
433
  mov ecx, [esp+ 4] ; Dst
446
  movq mm7, [mmx_one]
434
  push dword 0x01010101
435
  push dword 0x01010101
436
  movq mm7, [esp]
437
  add esp, byte 8
447
  COPY_H_SSE_RND1
438
  COPY_H_SSE_RND1
448
  lea ecx, [ecx+2*edx]
439
  lea ecx, [ecx+2*edx]
449
  COPY_H_SSE_RND1
440
  COPY_H_SSE_RND1
Lines 501-516 ALIGN 8 Link Here
501
  add eax, edx                  ; eax==line1
492
  add eax, edx                  ; eax==line1
502
  mov ecx, [esp+ 4] ; Dst
493
  mov ecx, [esp+ 4] ; Dst
503
494
504
  push esi
505
506
  pcmpeqb mm1, mm1
495
  pcmpeqb mm1, mm1
507
  pcmpeqb mm2, mm2
496
  pcmpeqb mm2, mm2
508
  mov esi, mm_minusone
497
  push byte -1
498
  push byte -1
509
  psubusb mm1, [byte eax]       ; line1
499
  psubusb mm1, [byte eax]       ; line1
510
  psubusb mm2, [eax+edx]        ; line2
500
  psubusb mm2, [eax+edx]        ; line2
511
  lea eax, [eax+2*edx]          ; eax==line3
501
  lea eax, [eax+2*edx]          ; eax==line3
512
  movq mm6, [esi]
502
  movq mm6, [esp]
513
  movq mm7, [esi]
503
  movq mm7, [esp]
514
  pavgb mm0, mm1
504
  pavgb mm0, mm1
515
  pavgb mm1, mm2
505
  pavgb mm1, mm2
516
  psubusb mm6, mm0
506
  psubusb mm6, mm0
Lines 526-540 ALIGN 8 Link Here
526
  lea eax, [eax+2*edx]          ; eax==line 5
516
  lea eax, [eax+2*edx]          ; eax==line 5
527
  pavgb mm2, mm3
517
  pavgb mm2, mm3
528
  pavgb mm3, mm4
518
  pavgb mm3, mm4
529
  movq mm0, [esi]
519
  movq mm0, [esp]
530
  movq mm1, [esi]
520
  movq mm1, [esp]
521
  add esp, byte 8
531
  psubusb mm0, mm2
522
  psubusb mm0, mm2
532
  psubusb mm1, mm3
523
  psubusb mm1, mm3
533
  movq [ecx], mm0
524
  movq [ecx], mm0
534
  movq [ecx+edx], mm1
525
  movq [ecx+edx], mm1
535
526
536
  pop esi
537
538
  ret
527
  ret
539
528
540
.endfunc
529
.endfunc
Lines 562-568 interpolate8x4_halfpel_hv_3dne: Link Here
562
  pavgb mm2, mm3
551
  pavgb mm2, mm3
563
  pxor mm3, mm6         ; mm2/mm3 ready
552
  pxor mm3, mm6         ; mm2/mm3 ready
564
  mov ecx, [esp+ 4]     ; Dst
553
  mov ecx, [esp+ 4]     ; Dst
565
  movq mm7, [mmx_one]
554
  push dword 0x01010101
555
  push dword 0x01010101
556
  movq mm7, [esp]
557
  lea esp, [esp + 8]
566
558
567
  jz near .rounding1
559
  jz near .rounding1
568
  lea ebp,[byte ebp]
560
  lea ebp,[byte ebp]
(-)xvidcore-1.1.3-old/src/image/x86_asm/interpolate8x8_mmx.asm (-98 / +165 lines)
Lines 166-178 interpolate8x8_halfpel_h_mmx: Link Here
166
166
167
  push esi
167
  push esi
168
  push edi
168
  push edi
169
  mov eax, [esp + 8 + 16]       ; rounding
169
  push ebp
170
  call get_pc.bp
171
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
170
172
171
  movq mm7, [rounding1_mmx + eax * 8]
173
  mov eax, [esp + 12 + 16]       ; rounding
172
174
173
  mov edi, [esp + 8 + 4]        ; dst
175
  movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
174
  mov esi, [esp + 8 + 8]        ; src
176
175
  mov edx, [esp + 8 + 12]       ; stride
177
  mov edi, [esp + 12 + 4]        ; dst
178
  mov esi, [esp + 12 + 8]        ; src
179
  mov edx, [esp + 12 + 12]       ; stride
176
180
177
  pxor mm6, mm6                 ; zero
181
  pxor mm6, mm6                 ; zero
178
182
Lines 185-190 interpolate8x8_halfpel_h_mmx: Link Here
185
  COPY_H_MMX
189
  COPY_H_MMX
186
  COPY_H_MMX
190
  COPY_H_MMX
187
191
192
  pop ebp
188
  pop edi
193
  pop edi
189
  pop esi
194
  pop esi
190
195
Lines 225-237 interpolate8x8_halfpel_v_mmx: Link Here
225
  push esi
230
  push esi
226
  push edi
231
  push edi
227
232
228
  mov eax, [esp + 8 + 16]       ; rounding
233
  push ebp
234
  call get_pc.bp
235
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
236
237
  mov eax, [esp + 12 + 16]       ; rounding
229
238
230
  movq mm7, [rounding1_mmx + eax * 8]
239
  movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
231
240
232
  mov edi, [esp + 8 + 4]        ; dst
241
  mov edi, [esp + 12 + 4]        ; dst
233
  mov esi, [esp + 8 + 8]        ; src
242
  mov esi, [esp + 12 + 8]        ; src
234
  mov edx, [esp + 8 + 12]       ; stride
243
  mov edx, [esp + 12 + 12]       ; stride
235
244
236
  pxor mm6, mm6                 ; zero
245
  pxor mm6, mm6                 ; zero
237
246
Lines 245-250 interpolate8x8_halfpel_v_mmx: Link Here
245
  COPY_V_MMX
254
  COPY_V_MMX
246
  COPY_V_MMX
255
  COPY_V_MMX
247
256
257
  pop ebp
248
  pop edi
258
  pop edi
249
  pop esi
259
  pop esi
250
260
Lines 315-332 interpolate8x8_halfpel_hv_mmx: Link Here
315
  push esi
325
  push esi
316
  push edi
326
  push edi
317
327
318
  mov eax, [esp + 8 + 16]   ; rounding
328
  push ebp
329
  call get_pc.bp
330
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
319
331
320
  movq mm7, [rounding2_mmx + eax * 8]
332
  mov eax, [esp + 12 + 16]   ; rounding
321
333
322
  mov edi, [esp + 8 + 4]    ; dst
334
  movq mm7, [ebp + rounding2_mmx + eax * 8 wrt ..gotoff]
323
  mov esi, [esp + 8 + 8]    ; src
335
336
  mov edi, [esp + 12 + 4]    ; dst
337
  mov esi, [esp + 12 + 8]    ; src
324
338
325
  mov eax, 8
339
  mov eax, 8
326
340
327
  pxor mm6, mm6             ; zero
341
  pxor mm6, mm6             ; zero
328
342
329
  mov edx, [esp + 8 + 12]   ; stride
343
  mov edx, [esp + 12 + 12]   ; stride
330
344
331
  COPY_HV_MMX
345
  COPY_HV_MMX
332
  COPY_HV_MMX
346
  COPY_HV_MMX
Lines 337-342 interpolate8x8_halfpel_hv_mmx: Link Here
337
  COPY_HV_MMX
351
  COPY_HV_MMX
338
  COPY_HV_MMX
352
  COPY_HV_MMX
339
353
354
  pop ebp
340
  pop edi
355
  pop edi
341
  pop esi
356
  pop esi
342
357
Lines 357-369 interpolate8x4_halfpel_h_mmx: Link Here
357
372
358
  push esi
373
  push esi
359
  push edi
374
  push edi
360
  mov eax, [esp + 8 + 16]       ; rounding
361
375
362
  movq mm7, [rounding1_mmx + eax * 8]
376
  push ebp
377
  call get_pc.bp
378
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
379
380
  mov eax, [esp + 12 + 16]       ; rounding
363
381
364
  mov edi, [esp + 8 + 4]        ; dst
382
  movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
365
  mov esi, [esp + 8 + 8]        ; src
383
366
  mov edx, [esp + 8 + 12]       ; stride
384
  mov edi, [esp + 12 + 4]        ; dst
385
  mov esi, [esp + 12 + 8]        ; src
386
  mov edx, [esp + 12 + 12]       ; stride
367
387
368
  pxor mm6, mm6                 ; zero
388
  pxor mm6, mm6                 ; zero
369
389
Lines 372-377 interpolate8x4_halfpel_h_mmx: Link Here
372
  COPY_H_MMX
392
  COPY_H_MMX
373
  COPY_H_MMX
393
  COPY_H_MMX
374
394
395
  pop ebp
375
  pop edi
396
  pop edi
376
  pop esi
397
  pop esi
377
398
Lines 394-406 interpolate8x4_halfpel_v_mmx: Link Here
394
  push esi
415
  push esi
395
  push edi
416
  push edi
396
417
397
  mov eax, [esp + 8 + 16]       ; rounding
418
  push ebp
419
  call get_pc.bp
420
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
421
422
  mov eax, [esp + 12 + 16]       ; rounding
398
423
399
  movq mm7, [rounding1_mmx + eax * 8]
424
  movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
400
425
401
  mov edi, [esp + 8 + 4]        ; dst
426
  mov edi, [esp + 12 + 4]        ; dst
402
  mov esi, [esp + 8 + 8]        ; src
427
  mov esi, [esp + 12 + 8]        ; src
403
  mov edx, [esp + 8 + 12]       ; stride
428
  mov edx, [esp + 12 + 12]       ; stride
404
429
405
  pxor mm6, mm6                 ; zero
430
  pxor mm6, mm6                 ; zero
406
431
Lines 410-415 interpolate8x4_halfpel_v_mmx: Link Here
410
  COPY_V_MMX
435
  COPY_V_MMX
411
  COPY_V_MMX
436
  COPY_V_MMX
412
437
438
  pop ebp
413
  pop edi
439
  pop edi
414
  pop esi
440
  pop esi
415
441
Lines 433-456 interpolate8x4_halfpel_hv_mmx: Link Here
433
  push esi
459
  push esi
434
  push edi
460
  push edi
435
461
436
  mov eax, [esp + 8 + 16]   ; rounding
462
  push ebp
463
  call get_pc.bp
464
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
437
465
438
  movq mm7, [rounding2_mmx + eax * 8]
466
  mov eax, [esp + 12 + 16]   ; rounding
439
467
440
  mov edi, [esp + 8 + 4]    ; dst
468
  movq mm7, [ebp + rounding2_mmx + eax * 8 wrt ..gotoff]
441
  mov esi, [esp + 8 + 8]    ; src
469
470
  mov edi, [esp + 12 + 4]    ; dst
471
  mov esi, [esp + 12 + 8]    ; src
442
472
443
  mov eax, 8
473
  mov eax, 8
444
474
445
  pxor mm6, mm6             ; zero
475
  pxor mm6, mm6             ; zero
446
476
447
  mov edx, [esp + 8 + 12]   ; stride
477
  mov edx, [esp + 12 + 12]   ; stride
448
478
449
  COPY_HV_MMX
479
  COPY_HV_MMX
450
  COPY_HV_MMX
480
  COPY_HV_MMX
451
  COPY_HV_MMX
481
  COPY_HV_MMX
452
  COPY_HV_MMX
482
  COPY_HV_MMX
453
483
484
  pop ebp
454
  pop edi
485
  pop edi
455
  pop esi
486
  pop esi
456
487
Lines 491-500 interpolate8x4_halfpel_hv_mmx: Link Here
491
522
492
  por mm3, mm6
523
  por mm3, mm6
493
524
494
  pand mm0, [mmx_mask]
525
  pand mm0, [ebp + mmx_mask wrt ..gotoff]
495
  pand mm1, [mmx_mask]
526
  pand mm1, [ebp + mmx_mask wrt ..gotoff]
496
  pand mm4, [mmx_mask]
527
  pand mm4, [ebp + mmx_mask wrt ..gotoff]
497
  pand mm5, [mmx_mask]
528
  pand mm5, [ebp + mmx_mask wrt ..gotoff]
498
529
499
  psrlq mm0, 1              ; src1 / 2
530
  psrlq mm0, 1              ; src1 / 2
500
  psrlq mm1, 1              ; src2 / 2
531
  psrlq mm1, 1              ; src2 / 2
Lines 538-547 interpolate8x4_halfpel_hv_mmx: Link Here
538
569
539
  pand mm3, mm6
570
  pand mm3, mm6
540
571
541
  pand mm0, [mmx_mask]
572
  pand mm0, [ebp + mmx_mask wrt ..gotoff]
542
  pand mm1, [mmx_mask]
573
  pand mm1, [ebp + mmx_mask wrt ..gotoff]
543
  pand mm4, [mmx_mask]
574
  pand mm4, [ebp + mmx_mask wrt ..gotoff]
544
  pand mm5, [mmx_mask]
575
  pand mm5, [ebp + mmx_mask wrt ..gotoff]
545
576
546
  psrlq mm0, 1              ; src1 / 2
577
  psrlq mm0, 1              ; src1 / 2
547
  psrlq mm1, 1              ; src2 / 2
578
  psrlq mm1, 1              ; src2 / 2
Lines 567-587 interpolate8x8_avg2_mmx: Link Here
567
598
568
  push ebx
599
  push ebx
569
600
570
  mov eax, [esp + 4 + 20]   ; rounding
601
  push ebp
602
  call get_pc.bp
603
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
604
605
  mov eax, [esp + 8 + 20]   ; rounding
571
  test eax, eax
606
  test eax, eax
572
607
573
  jnz near .rounding1
608
  jnz near .rounding1
574
609
575
  mov eax, [esp + 4 + 24]   ; height -> eax
610
  mov eax, [esp + 8 + 24]   ; height -> eax
576
  sub eax, 8
611
  sub eax, 8
577
  test eax, eax
612
  test eax, eax
578
613
579
  mov ecx, [esp + 4 + 4]    ; dst -> edi
614
  mov ecx, [esp + 8 + 4]    ; dst -> edi
580
  mov eax, [esp + 4 + 8]    ; src1 -> esi
615
  mov eax, [esp + 8 + 8]    ; src1 -> esi
581
  mov ebx, [esp + 4 + 12]   ; src2 -> eax
616
  mov ebx, [esp + 8 + 12]   ; src2 -> eax
582
  mov edx, [esp + 4 + 16]   ; stride -> edx
617
  mov edx, [esp + 8 + 16]   ; stride -> edx
583
618
584
  movq mm7, [mmx_one]
619
  movq mm7, [ebp + mmx_one wrt ..gotoff]
585
620
586
  jz near .start0
621
  jz near .start0
587
622
Lines 598-617 interpolate8x8_avg2_mmx: Link Here
598
  lea ecx, [ecx+2*edx]
633
  lea ecx, [ecx+2*edx]
599
  AVG2_MMX_RND0
634
  AVG2_MMX_RND0
600
635
636
  pop ebp
601
  pop ebx
637
  pop ebx
602
  ret
638
  ret
603
639
604
.rounding1
640
.rounding1
605
  mov eax, [esp + 4 + 24]       ; height -> eax
641
  mov eax, [esp + 8 + 24]       ; height -> eax
606
  sub eax, 8
642
  sub eax, 8
607
  test eax, eax
643
  test eax, eax
608
644
609
  mov ecx, [esp + 4 + 4]        ; dst -> edi
645
  mov ecx, [esp + 8 + 4]        ; dst -> edi
610
  mov eax, [esp + 4 + 8]        ; src1 -> esi
646
  mov eax, [esp + 8 + 8]        ; src1 -> esi
611
  mov ebx, [esp + 4 + 12]       ; src2 -> eax
647
  mov ebx, [esp + 8 + 12]       ; src2 -> eax
612
  mov edx, [esp + 4 + 16]       ; stride -> edx
648
  mov edx, [esp + 8 + 16]       ; stride -> edx
613
649
614
  movq mm7, [mmx_one]
650
  movq mm7, [ebp + mmx_one wrt ..gotoff]
615
651
616
  jz near .start1
652
  jz near .start1
617
653
Lines 628-633 interpolate8x8_avg2_mmx: Link Here
628
  lea ecx, [ecx+2*edx]
664
  lea ecx, [ecx+2*edx]
629
  AVG2_MMX_RND1
665
  AVG2_MMX_RND1
630
666
667
  pop ebp
631
  pop ebx
668
  pop ebx
632
  ret
669
  ret
633
.endfunc
670
.endfunc
Lines 652-662 interpolate8x8_avg2_mmx: Link Here
652
  movq mm2, mm0
689
  movq mm2, mm0
653
  movq mm3, mm1
690
  movq mm3, mm1
654
691
655
  pand mm2, [mmx_three]
692
  pand mm2, [ebp + mmx_three wrt ..gotoff]
656
  pand mm3, [mmx_three]
693
  pand mm3, [ebp + mmx_three wrt ..gotoff]
657
694
658
  pand mm0, [mmx_mask2]
695
  pand mm0, [ebp + mmx_mask2 wrt ..gotoff]
659
  pand mm1, [mmx_mask2]
696
  pand mm1, [ebp + mmx_mask2 wrt ..gotoff]
660
697
661
  psrlq mm0, 2
698
  psrlq mm0, 2
662
  psrlq mm1, 2
699
  psrlq mm1, 2
Lines 673-683 interpolate8x8_avg2_mmx: Link Here
673
  movq mm1, mm4
710
  movq mm1, mm4
674
  movq mm3, mm5
711
  movq mm3, mm5
675
712
676
  pand mm1, [mmx_three]
713
  pand mm1, [ebp + mmx_three wrt ..gotoff]
677
  pand mm3, [mmx_three]
714
  pand mm3, [ebp + mmx_three wrt ..gotoff]
678
715
679
  pand mm4, [mmx_mask2]
716
  pand mm4, [ebp + mmx_mask2 wrt ..gotoff]
680
  pand mm5, [mmx_mask2]
717
  pand mm5, [ebp + mmx_mask2 wrt ..gotoff]
681
718
682
  psrlq mm4, 2
719
  psrlq mm4, 2
683
  psrlq mm5, 2
720
  psrlq mm5, 2
Lines 688-695 interpolate8x8_avg2_mmx: Link Here
688
  paddb mm1, mm3
725
  paddb mm1, mm3
689
  paddb mm2, mm1
726
  paddb mm2, mm1
690
727
691
  paddb mm2, [mmx_two]
728
  paddb mm2, [ebp + mmx_two wrt ..gotoff]
692
  pand mm2, [mmx_mask2]
729
  pand mm2, [ebp + mmx_mask2 wrt ..gotoff]
693
730
694
  psrlq mm2, 2
731
  psrlq mm2, 2
695
  paddb mm0, mm2
732
  paddb mm0, mm2
Lines 707-717 interpolate8x8_avg2_mmx: Link Here
707
  movq mm2, mm0
744
  movq mm2, mm0
708
  movq mm3, mm1
745
  movq mm3, mm1
709
746
710
  pand mm2, [mmx_three]
747
  pand mm2, [ebp + mmx_three wrt ..gotoff]
711
  pand mm3, [mmx_three]
748
  pand mm3, [ebp + mmx_three wrt ..gotoff]
712
749
713
  pand mm0, [mmx_mask2]
750
  pand mm0, [ebp + mmx_mask2 wrt ..gotoff]
714
  pand mm1, [mmx_mask2]
751
  pand mm1, [ebp + mmx_mask2 wrt ..gotoff]
715
752
716
  psrlq mm0, 2
753
  psrlq mm0, 2
717
  psrlq mm1, 2
754
  psrlq mm1, 2
Lines 728-738 interpolate8x8_avg2_mmx: Link Here
728
  movq mm1, mm4
765
  movq mm1, mm4
729
  movq mm3, mm5
766
  movq mm3, mm5
730
767
731
  pand mm1, [mmx_three]
768
  pand mm1, [ebp + mmx_three wrt ..gotoff]
732
  pand mm3, [mmx_three]
769
  pand mm3, [ebp + mmx_three wrt ..gotoff]
733
770
734
  pand mm4, [mmx_mask2]
771
  pand mm4, [ebp + mmx_mask2 wrt ..gotoff]
735
  pand mm5, [mmx_mask2]
772
  pand mm5, [ebp + mmx_mask2 wrt ..gotoff]
736
773
737
  psrlq mm4, 2
774
  psrlq mm4, 2
738
  psrlq mm5, 2
775
  psrlq mm5, 2
Lines 743-750 interpolate8x8_avg2_mmx: Link Here
743
  paddb mm1, mm3
780
  paddb mm1, mm3
744
  paddb mm2, mm1
781
  paddb mm2, mm1
745
782
746
  paddb mm2, [mmx_one]
783
  paddb mm2, [ebp + mmx_one wrt ..gotoff]
747
  pand mm2, [mmx_mask2]
784
  pand mm2, [ebp + mmx_mask2 wrt ..gotoff]
748
785
749
  psrlq mm2, 2
786
  psrlq mm2, 2
750
  paddb mm0, mm2
787
  paddb mm0, mm2
Lines 762-779 interpolate8x8_avg4_mmx: Link Here
762
  push edi
799
  push edi
763
  push esi
800
  push esi
764
801
765
  mov eax, [esp + 12 + 28]      ; rounding
802
  push ebp
803
  call get_pc.bp
804
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
805
806
  mov eax, [esp + 16 + 28]      ; rounding
766
807
767
  test eax, eax
808
  test eax, eax
768
809
769
  mov ecx, [esp + 12 + 4]       ; dst -> edi
810
  mov ecx, [esp + 16 + 4]       ; dst -> edi
770
  mov eax, [esp + 12 + 8]       ; src1 -> esi
811
  mov eax, [esp + 16 + 8]       ; src1 -> esi
771
  mov ebx, [esp + 12 + 12]      ; src2 -> eax
812
  mov ebx, [esp + 16 + 12]      ; src2 -> eax
772
  mov esi, [esp + 12 + 16]      ; src3 -> esi
813
  mov esi, [esp + 16 + 16]      ; src3 -> esi
773
  mov edi, [esp + 12 + 20]      ; src4 -> edi
814
  mov edi, [esp + 16 + 20]      ; src4 -> edi
774
  mov edx, [esp + 12 + 24]      ; stride -> edx
815
  mov edx, [esp + 16 + 24]      ; stride -> edx
775
816
776
  movq mm7, [mmx_one]
817
  movq mm7, [ebp + mmx_one wrt ..gotoff]
777
818
778
  jnz near .rounding1
819
  jnz near .rounding1
779
820
Lines 793-798 interpolate8x8_avg4_mmx: Link Here
793
  lea ecx, [ecx+edx]
834
  lea ecx, [ecx+edx]
794
  AVG4_MMX_RND0
835
  AVG4_MMX_RND0
795
836
837
  pop ebp
796
  pop esi
838
  pop esi
797
  pop edi
839
  pop edi
798
  pop ebx
840
  pop ebx
Lines 815-820 interpolate8x8_avg4_mmx: Link Here
815
  lea ecx, [ecx+edx]
857
  lea ecx, [ecx+edx]
816
  AVG4_MMX_RND1
858
  AVG4_MMX_RND1
817
859
860
  pop ebp
818
  pop esi
861
  pop esi
819
  pop edi
862
  pop edi
820
  pop ebx
863
  pop ebx
Lines 868-875 interpolate8x8_avg4_mmx: Link Here
868
  psubsw mm0, mm2
911
  psubsw mm0, mm2
869
  psubsw mm1, mm3
912
  psubsw mm1, mm3
870
913
871
  pmullw mm0, [mmx_five]
914
  pmullw mm0, [ebp + mmx_five wrt ..gotoff]
872
  pmullw mm1, [mmx_five]
915
  pmullw mm1, [ebp + mmx_five wrt ..gotoff]
873
916
874
  movq mm2, [eax-2]
917
  movq mm2, [eax-2]
875
  movq mm4, [eax+3]
918
  movq mm4, [eax+3]
Lines 903-915 interpolate8x8_avg4_mmx: Link Here
903
ALIGN 16
946
ALIGN 16
904
interpolate8x8_6tap_lowpass_h_mmx:
947
interpolate8x8_6tap_lowpass_h_mmx:
905
948
906
  mov eax, [esp + 16]           ; rounding
949
  push ebp
950
  call get_pc.bp
951
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
952
953
  mov eax, [esp + 20]           ; rounding
907
954
908
  movq mm6, [rounding_lowpass_mmx + eax * 8]
955
  movq mm6, [ebp + rounding_lowpass_mmx + eax * 8 wrt ..gotoff]
909
956
910
  mov ecx, [esp + 4]            ; dst -> edi
957
  mov ecx, [esp + 8]            ; dst -> edi
911
  mov eax, [esp + 8]            ; src -> esi
958
  mov eax, [esp + 12]            ; src -> esi
912
  mov edx, [esp + 12]           ; stride -> edx
959
  mov edx, [esp + 16]           ; stride -> edx
913
960
914
  pxor mm7, mm7
961
  pxor mm7, mm7
915
962
Lines 929-934 interpolate8x8_6tap_lowpass_h_mmx: Link Here
929
  lea ecx, [ecx+edx]
976
  lea ecx, [ecx+edx]
930
  LOWPASS_6TAP_H_MMX
977
  LOWPASS_6TAP_H_MMX
931
978
979
  pop ebp
932
  ret
980
  ret
933
.endfunc
981
.endfunc
934
982
Lines 979-986 interpolate8x8_6tap_lowpass_h_mmx: Link Here
979
  psubsw mm0, mm2
1027
  psubsw mm0, mm2
980
  psubsw mm1, mm3
1028
  psubsw mm1, mm3
981
1029
982
  pmullw mm0, [mmx_five]
1030
  pmullw mm0, [ebp + mmx_five wrt ..gotoff]
983
  pmullw mm1, [mmx_five]
1031
  pmullw mm1, [ebp + mmx_five wrt ..gotoff]
984
1032
985
  movq mm2, [eax+edx]
1033
  movq mm2, [eax+edx]
986
  movq mm4, [eax+2*ebx]
1034
  movq mm4, [eax+2*ebx]
Lines 1016-1028 interpolate8x8_6tap_lowpass_v_mmx: Link Here
1016
1064
1017
  push ebx
1065
  push ebx
1018
1066
1019
  mov eax, [esp + 4 + 16]           ; rounding
1067
  push ebp
1068
  call get_pc.bp
1069
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
1020
1070
1021
  movq mm6, [rounding_lowpass_mmx + eax * 8]
1071
  mov eax, [esp + 8 + 16]           ; rounding
1022
1072
1023
  mov ecx, [esp + 4 + 4]            ; dst -> edi
1073
  movq mm6, [ebp + rounding_lowpass_mmx + eax * 8 wrt ..gotoff]
1024
  mov eax, [esp + 4 + 8]            ; src -> esi
1074
1025
  mov edx, [esp + 4 + 12]           ; stride -> edx
1075
  mov ecx, [esp + 8 + 4]            ; dst -> edi
1076
  mov eax, [esp + 8 + 8]            ; src -> esi
1077
  mov edx, [esp + 8 + 12]           ; stride -> edx
1026
1078
1027
  mov ebx, edx
1079
  mov ebx, edx
1028
  shl ebx, 1
1080
  shl ebx, 1
Lines 1046-1051 interpolate8x8_6tap_lowpass_v_mmx: Link Here
1046
  lea ecx, [ecx+edx]
1098
  lea ecx, [ecx+edx]
1047
  LOWPASS_6TAP_V_MMX
1099
  LOWPASS_6TAP_V_MMX
1048
1100
1101
  pop ebp
1049
  pop ebx
1102
  pop ebx
1050
  ret
1103
  ret
1051
.endfunc
1104
.endfunc
Lines 1066-1077 interpolate8x8_6tap_lowpass_v_mmx: Link Here
1066
1119
1067
%macro PROLOG 2   ; %1: Rounder, %2 load Dst-Rounder
1120
%macro PROLOG 2   ; %1: Rounder, %2 load Dst-Rounder
1068
  pxor mm6, mm6
1121
  pxor mm6, mm6
1069
  movq mm7, [%1]    ; TODO: dangerous! (eax isn't checked)
1122
  PROLOG0
1123
1124
  push ebp
1125
  call get_pc.bp
1126
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
1127
1070
%if %2
1128
%if %2
1071
  movq mm5, [rounding1_mmx]
1129
  movq mm5, [ebp + rounding1_mmx wrt ..gotoff]
1072
%endif
1130
%endif
1073
1131
1074
  PROLOG0
1132
  movq mm7, [ebp + %1 wrt ..gotoff]    ; TODO: dangerous! (eax isn't checked)
1075
%endmacro
1133
%endmacro
1076
1134
1077
  ; performs: mm0 == (mm0+mm2)  mm1 == (mm1+mm3)
1135
  ; performs: mm0 == (mm0+mm2)  mm1 == (mm1+mm3)
Lines 1160-1165 interpolate8x8_halfpel_add_mmx: Link Here
1160
  ADD_FF_MMX 1
1218
  ADD_FF_MMX 1
1161
  ADD_FF_MMX 1
1219
  ADD_FF_MMX 1
1162
  ADD_FF_MMX 0
1220
  ADD_FF_MMX 0
1221
  pop ebp
1163
  ret
1222
  ret
1164
.endfunc
1223
.endfunc
1165
1224
Lines 1206-1211 interpolate8x8_halfpel_h_add_mmx: Link Here
1206
  ADD_FH_MMX
1265
  ADD_FH_MMX
1207
  lea ecx,[ecx+edx]
1266
  lea ecx,[ecx+edx]
1208
  ADD_FH_MMX
1267
  ADD_FH_MMX
1268
  pop ebp
1209
  ret
1269
  ret
1210
.endfunc
1270
.endfunc
1211
1271
Lines 1253-1258 interpolate8x8_halfpel_v_add_mmx: Link Here
1253
  ADD_HF_MMX
1313
  ADD_HF_MMX
1254
  lea ecx,[ecx+edx]
1314
  lea ecx,[ecx+edx]
1255
  ADD_HF_MMX
1315
  ADD_HF_MMX
1316
  pop ebp
1256
  ret
1317
  ret
1257
.endfunc
1318
.endfunc
1258
1319
Lines 1318-1325 interpolate8x8_halfpel_v_add_mmx: Link Here
1318
  paddusw mm0, mm4  ; mix Src(mm0/mm1) with Dst(mm2/mm3)
1379
  paddusw mm0, mm4  ; mix Src(mm0/mm1) with Dst(mm2/mm3)
1319
  paddusw mm1, mm5
1380
  paddusw mm1, mm5
1320
1381
1321
  paddusw mm0, [rounding1_mmx]
1382
  paddusw mm0, [ebp + rounding1_mmx wrt ..gotoff]
1322
  paddusw mm1, [rounding1_mmx]
1383
  paddusw mm1, [ebp + rounding1_mmx wrt ..gotoff]
1323
1384
1324
  psrlw mm0, 1
1385
  psrlw mm0, 1
1325
  psrlw mm1, 1
1386
  psrlw mm1, 1
Lines 1329-1334 interpolate8x8_halfpel_v_add_mmx: Link Here
1329
  movq [ecx], mm0
1390
  movq [ecx], mm0
1330
%endmacro
1391
%endmacro
1331
1392
1393
extern  _GLOBAL_OFFSET_TABLE_
1394
get_pc.bp:
1395
  mov ebp, [esp]
1396
  retn
1397
1332
ALIGN 16
1398
ALIGN 16
1333
interpolate8x8_halfpel_hv_add_mmx:
1399
interpolate8x8_halfpel_hv_add_mmx:
1334
  PROLOG rounding2_mmx, 0    ; mm5 is busy. Don't load dst-rounder
1400
  PROLOG rounding2_mmx, 0    ; mm5 is busy. Don't load dst-rounder
Lines 1364-1369 interpolate8x8_halfpel_hv_add_mmx: Link Here
1364
  lea ecx,[ecx+edx]
1430
  lea ecx,[ecx+edx]
1365
  ADD_HH_MMX
1431
  ADD_HH_MMX
1366
1432
1433
  pop ebp
1367
  ret
1434
  ret
1368
.endfunc
1435
.endfunc
1369
1436
(-)xvidcore-1.1.3-old/src/image/x86_asm/interpolate8x8_xmm.asm (-24 / +37 lines)
Lines 42-61 BITS 32 Link Here
42
	%endif
42
	%endif
43
%endmacro
43
%endmacro
44
44
45
;=============================================================================
46
; Read only data
47
;=============================================================================
48
49
%ifdef FORMAT_COFF
50
SECTION .rodata
51
%else
52
SECTION .rodata align=16
53
%endif
54
55
ALIGN 16
56
mmx_one:
57
	times 8 db 1
58
59
SECTION .text
45
SECTION .text
60
46
61
cglobal interpolate8x8_halfpel_h_xmm
47
cglobal interpolate8x8_halfpel_h_xmm
Lines 132-138 interpolate8x8_halfpel_h_xmm: Link Here
132
118
133
.rounding1
119
.rounding1
134
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
120
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
135
  movq mm7, [mmx_one]
121
  push dword 0x01010101
122
  push dword 0x01010101
123
  movq mm7, [esp]
124
  add esp, byte 8
136
  COPY_H_SSE_RND1
125
  COPY_H_SSE_RND1
137
  lea ecx, [ecx+2*edx]
126
  lea ecx, [ecx+2*edx]
138
  COPY_H_SSE_RND1
127
  COPY_H_SSE_RND1
Lines 204-210 interpolate8x8_halfpel_v_xmm: Link Here
204
193
205
.rounding1
194
.rounding1
206
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
195
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
207
  movq mm7, [mmx_one]
196
  push dword 0x01010101
197
  push dword 0x01010101
198
  movq mm7, [esp]
199
  add esp, byte 8
208
  movq mm2, [eax]       ; loop invariant
200
  movq mm2, [eax]       ; loop invariant
209
  add eax, edx
201
  add eax, edx
210
202
Lines 326-332 interpolate8x8_halfpel_hv_xmm: Link Here
326
  mov eax, [esp+ 8]  ; Src
318
  mov eax, [esp+ 8]  ; Src
327
  mov edx, [esp+12]  ; stride
319
  mov edx, [esp+12]  ; stride
328
320
329
  movq mm7, [mmx_one]
321
  push dword 0x01010101
322
  push dword 0x01010101
323
  movq mm7, [esp]
324
  add esp, byte 8
330
325
331
    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
326
    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
332
  movq mm2, [eax]
327
  movq mm2, [eax]
Lines 384-390 interpolate8x4_halfpel_h_xmm: Link Here
384
379
385
.rounding1
380
.rounding1
386
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
381
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
387
  movq mm7, [mmx_one]
382
  push dword 0x01010101
383
  push dword 0x01010101
384
  movq mm7, [esp]
385
  add esp, byte 8
388
  COPY_H_SSE_RND1
386
  COPY_H_SSE_RND1
389
  lea ecx, [ecx+2*edx]
387
  lea ecx, [ecx+2*edx]
390
  COPY_H_SSE_RND1
388
  COPY_H_SSE_RND1
Lines 419-425 interpolate8x4_halfpel_v_xmm: Link Here
419
417
420
.rounding1
418
.rounding1
421
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
419
 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
422
  movq mm7, [mmx_one]
420
  push dword 0x01010101
421
  push dword 0x01010101
422
  movq mm7, [esp]
423
  add esp, byte 8
423
  movq mm2, [eax]       ; loop invariant
424
  movq mm2, [eax]       ; loop invariant
424
  add eax, edx
425
  add eax, edx
425
426
Lines 458-464 interpolate8x4_halfpel_hv_xmm: Link Here
458
  mov eax, [esp+ 8]  ; Src
459
  mov eax, [esp+ 8]  ; Src
459
  mov edx, [esp+12]  ; stride
460
  mov edx, [esp+12]  ; stride
460
461
461
  movq mm7, [mmx_one]
462
  push dword 0x01010101
463
  push dword 0x01010101
464
  movq mm7, [esp]
465
  add esp, byte 8
462
466
463
    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
467
    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
464
  movq mm2, [eax]
468
  movq mm2, [eax]
Lines 583-590 interpolate8x8_halfpel_add_xmm: ; 23c Link Here
583
    pxor mm2, mm4
587
    pxor mm2, mm4
584
    pavgb mm1, mm3
588
    pavgb mm1, mm3
585
    pxor mm3, mm5
589
    pxor mm3, mm5
586
    pand mm2, [mmx_one]
590
    pand mm2, [esp]
587
    pand mm3, [mmx_one]
591
    pand mm3, [esp]
588
    psubb mm0, mm2
592
    psubb mm0, mm2
589
    psubb mm1, mm3
593
    psubb mm1, mm3
590
    pavgb mm0, [ecx+%1]
594
    pavgb mm0, [ecx+%1]
Lines 612-617 interpolate8x8_halfpel_h_add_xmm: ; 32 Link Here
612
.Loop1
616
.Loop1
613
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
617
  ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
614
  ; movq mm7, [mmx_one]
618
  ; movq mm7, [mmx_one]
619
  push dword 0x01010101
620
  push dword 0x01010101
615
  ADD_FH_RND1 0, edx
621
  ADD_FH_RND1 0, edx
616
  lea eax,[eax+2*edx]
622
  lea eax,[eax+2*edx]
617
  lea ecx,[ecx+2*edx]
623
  lea ecx,[ecx+2*edx]
Lines 622-627 interpolate8x8_halfpel_h_add_xmm: ; 32 Link Here
622
  lea eax,[eax+2*edx]
628
  lea eax,[eax+2*edx]
623
  lea ecx,[ecx+2*edx]
629
  lea ecx,[ecx+2*edx]
624
  ADD_FH_RND1 0, edx
630
  ADD_FH_RND1 0, edx
631
  add esp, byte 8
625
  EPILOG
632
  EPILOG
626
.endfunc
633
.endfunc
627
634
Lines 686-692 interpolate8x8_halfpel_v_add_xmm: Link Here
686
693
687
.Loop1
694
.Loop1
688
  movq mm0, [eax] ; loop invariant
695
  movq mm0, [eax] ; loop invariant
689
  movq mm7, [mmx_one]
696
  push dword 0x01010101
697
  push dword 0x01010101
698
  movq mm7, [esp]
699
  add esp, byte 8
690
700
691
  ADD_8_HF_RND1 
701
  ADD_8_HF_RND1 
692
  movq mm0, mm2
702
  movq mm0, mm2
Lines 809-815 ALIGN 16 Link Here
809
interpolate8x8_halfpel_hv_add_xmm:
819
interpolate8x8_halfpel_hv_add_xmm:
810
  PROLOG1
820
  PROLOG1
811
821
812
  movq mm7, [mmx_one]
822
  push dword 0x01010101
823
  push dword 0x01010101
824
  movq mm7, [esp]
813
825
814
    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
826
    ; loop invariants: mm2=(i+j+1)/2  and  mm3= i^j
815
  movq mm2, [eax] 
827
  movq mm2, [eax] 
Lines 838-843 interpolate8x8_halfpel_hv_add_xmm: Link Here
838
  add ecx, edx
850
  add ecx, edx
839
  ADD_HH_RND1
851
  ADD_HH_RND1
840
852
853
  add esp, byte 8
841
  EPILOG
854
  EPILOG
842
.endfunc
855
.endfunc
843
856
(-)xvidcore-1.1.3-old/src/image/x86_asm/postprocessing_mmx.asm (-4 / +12 lines)
Lines 70-75 mmx_offset: Link Here
70
70
71
SECTION .text
71
SECTION .text
72
72
73
extern  _GLOBAL_OFFSET_TABLE_
74
get_pc.bp:
75
  mov ebp, [esp]
76
  retn
77
73
cglobal image_brightness_mmx
78
cglobal image_brightness_mmx
74
79
75
80
Lines 83-98 image_brightness_mmx: Link Here
83
	push esi
88
	push esi
84
	push edi
89
	push edi
85
90
86
	movq mm6, [mmx_0x80]
87
88
	mov eax, [esp+8+20] ; offset
91
	mov eax, [esp+8+20] ; offset
89
	movq mm7, [mmx_offset + (eax + 128)*8]   ; being lazy
90
91
	mov edx, [esp+8+4]  ; Dst
92
	mov edx, [esp+8+4]  ; Dst
92
	mov ecx, [esp+8+8]  ; stride
93
	mov ecx, [esp+8+8]  ; stride
93
	mov esi, [esp+8+12] ; width
94
	mov esi, [esp+8+12] ; width
94
	mov edi, [esp+8+16] ; height
95
	mov edi, [esp+8+16] ; height
95
96
97
	push ebp
98
	call get_pc.bp
99
	add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
100
	movq mm6, [ebp + mmx_0x80 wrt ..gotoff]
101
	movq mm7, [ebp + (eax + 128)*8 + mmx_offset wrt ..gotoff]   ; being lazy
102
	pop ebp
103
96
.yloop
104
.yloop
97
	xor	eax, eax
105
	xor	eax, eax
98
106
(-)xvidcore-1.1.3-old/src/image/x86_asm/postprocessing_sse2.asm (-33 / +10 lines)
Lines 42-60 BITS 32 Link Here
42
	%endif
42
	%endif
43
%endmacro
43
%endmacro
44
44
45
;===========================================================================
46
; read only data
47
;===========================================================================
48
49
%ifdef FORMAT_COFF
50
SECTION .rodata
51
%else
52
SECTION .rodata align=16
53
%endif
54
55
xmm_0x80:
56
	times 16 db 0x80
57
58
;=============================================================================
45
;=============================================================================
59
; Code
46
; Code
60
;=============================================================================
47
;=============================================================================
Lines 69-89 cglobal image_brightness_sse2 Link Here
69
56
70
%macro CREATE_OFFSET_VECTOR 2
57
%macro CREATE_OFFSET_VECTOR 2
71
  mov [%1 +  0], %2
58
  mov [%1 +  0], %2
72
  mov [%1 +  1], %2
73
  mov [%1 +  2], %2
74
  mov [%1 +  3], %2
75
  mov [%1 +  4], %2
59
  mov [%1 +  4], %2
76
  mov [%1 +  5], %2
77
  mov [%1 +  6], %2
78
  mov [%1 +  7], %2
79
  mov [%1 +  8], %2
60
  mov [%1 +  8], %2
80
  mov [%1 +  9], %2
81
  mov [%1 + 10], %2
82
  mov [%1 + 11], %2
83
  mov [%1 + 12], %2
61
  mov [%1 + 12], %2
84
  mov [%1 + 13], %2
85
  mov [%1 + 14], %2
86
  mov [%1 + 15], %2
87
%endmacro
62
%endmacro
88
63
89
ALIGN 16
64
ALIGN 16
Lines 93-107 image_brightness_sse2: Link Here
93
  push edi    ; 8 bytes offset for push
68
  push edi    ; 8 bytes offset for push
94
  sub esp, 32 ; 32 bytes for local data (16bytes will be used, 16bytes more to align correctly mod 16)
69
  sub esp, 32 ; 32 bytes for local data (16bytes will be used, 16bytes more to align correctly mod 16)
95
70
96
  movdqa xmm6, [xmm_0x80]
97
98
  ; Create a offset...offset vector
71
  ; Create a offset...offset vector
99
  mov eax, [esp+8+32+20] ; brightness offset value	
72
  movzx eax, byte [esp+8+32+20] ; brightness offset value
100
  mov edx, esp           ; edx will be esp aligned mod 16
73
  mov ecx, esp           ; ecx will be esp aligned mod 16
101
  add edx, 15            ; edx = esp + 15
74
  mov edx, 0x01010101
102
  and edx, ~15           ; edx = (esp + 15)&(~15)
75
  add ecx, 15            ; ecx = esp + 15
103
  CREATE_OFFSET_VECTOR edx, al
76
  mul edx
104
  movdqa xmm7, [edx]
77
  and ecx, ~15           ; ecx = (esp + 15)&(~15)
78
  CREATE_OFFSET_VECTOR ecx, dword 0x80808080
79
  movdqa xmm6, [ecx]
80
  CREATE_OFFSET_VECTOR ecx, eax
81
  movdqa xmm7, [ecx]
105
82
106
  mov edx, [esp+8+32+4]  ; Dst
83
  mov edx, [esp+8+32+4]  ; Dst
107
  mov ecx, [esp+8+32+8]  ; stride
84
  mov ecx, [esp+8+32+8]  ; stride
(-)xvidcore-1.1.3-old/src/image/x86_asm/qpel_mmx.asm (-163 / +172 lines)
Lines 201-206 FIR_C23: times 4 dw 23 Link Here
201
201
202
SECTION .text
202
SECTION .text
203
203
204
extern  _GLOBAL_OFFSET_TABLE_
205
get_pc.cx:
206
  mov ecx, [esp]
207
  retn
208
204
;//////////////////////////////////////////////////////////////////////
209
;//////////////////////////////////////////////////////////////////////
205
;// Here we go with the Q-Pel mess.
210
;// Here we go with the Q-Pel mess.
206
;//  For horizontal passes, we process 4 *output* pixel in parallel
211
;//  For horizontal passes, we process 4 *output* pixel in parallel
Lines 208-229 SECTION .text Link Here
208
;//////////////////////////////////////////////////////////////////////
213
;//////////////////////////////////////////////////////////////////////
209
214
210
%macro PROLOG_NO_AVRG 0
215
%macro PROLOG_NO_AVRG 0
216
  push ebx
211
  push esi
217
  push esi
212
  push edi
218
  push edi
213
  push ebp
219
  push ebp
214
  mov edi, [esp+16 + 0*4] ; Dst
220
  mov edi, [esp+20 + 0*4] ; Dst
215
  mov esi, [esp+16 + 1*4] ; Src
221
  mov esi, [esp+20 + 1*4] ; Src
216
  mov ecx, [esp+16 + 2*4] ; Size
222
  mov ebp, [esp+20 + 3*4] ; BpS
217
  mov ebp, [esp+16 + 3*4] ; BpS
223
  mov eax, [esp+20 + 4*4] ; Rnd
218
  mov eax, [esp+16 + 4*4] ; Rnd
219
  and eax, 1
224
  and eax, 1
220
  movq mm7, [Rounder_QP_MMX+eax*8]  ; rounder
225
  call get_pc.cx
226
  add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
227
  movq mm7, [ecx + Rounder_QP_MMX+eax*8 wrt ..gotoff]  ; rounder
221
%endmacro
228
%endmacro
222
229
223
%macro EPILOG_NO_AVRG 0
230
%macro EPILOG_NO_AVRG 0
224
  pop ebp
231
  pop ebp
225
  pop edi
232
  pop edi
226
  pop esi
233
  pop esi
234
  pop ebx
227
  ret
235
  ret
228
%endmacro
236
%endmacro
229
237
Lines 234-245 SECTION .text Link Here
234
  push ebp
242
  push ebp
235
  mov edi, [esp+20 + 0*4] ; Dst
243
  mov edi, [esp+20 + 0*4] ; Dst
236
  mov esi, [esp+20 + 1*4] ; Src
244
  mov esi, [esp+20 + 1*4] ; Src
237
  mov ecx, [esp+20 + 2*4] ; Size
238
  mov ebp, [esp+20 + 3*4] ; BpS
245
  mov ebp, [esp+20 + 3*4] ; BpS
239
  mov eax, [esp+20 + 4*4] ; Rnd
246
  mov eax, [esp+20 + 4*4] ; Rnd
240
  and eax, 1
247
  and eax, 1
241
  movq mm7, [Rounder_QP_MMX+eax*8]  ; rounder
248
  call get_pc.cx
242
  lea ebx, [Rounder1_MMX+eax*8]     ; *Rounder2
249
  add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
250
  movq mm7, [ecx + Rounder_QP_MMX+eax*8 wrt ..gotoff]  ; rounder
251
  lea ebx, [ecx + Rounder1_MMX+eax*8 wrt ..gotoff]     ; *Rounder2
243
%endmacro
252
%endmacro
244
253
245
%macro EPILOG_AVRG 0
254
%macro EPILOG_AVRG 0
Lines 261-283 SECTION .text Link Here
261
%macro TLOAD 2     ; %1,%2: src pixels
270
%macro TLOAD 2     ; %1,%2: src pixels
262
  movzx eax, byte [esi+%1]
271
  movzx eax, byte [esi+%1]
263
  movzx edx, byte [esi+%2]
272
  movzx edx, byte [esi+%2]
264
  movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ]
273
  movq mm0, [ecx + xvid_FIR_14_3_2_1 + eax*8 wrt ..gotoff]
265
  movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ]
274
  movq mm3, [ecx + xvid_FIR_1_2_3_14 + edx*8 wrt ..gotoff]
266
  paddw mm0, mm7
275
  paddw mm0, mm7
267
  paddw mm3, mm7
276
  paddw mm3, mm7
268
%endmacro
277
%endmacro
269
278
270
%macro TACCUM2 5   ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs
279
%macro TACCUM2 5   ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs
271
  movzx eax, byte [esi+%1]
280
  movzx eax, byte [esi+%1]
272
  paddw %4, [%2 + eax*8]
281
  paddw %4, [eax*8 + %2]
273
  paddw %5, [%3 + eax*8]
282
  paddw %5, [eax*8 + %3]
274
%endmacro
283
%endmacro
275
284
276
%macro TACCUM3 7   ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs
285
%macro TACCUM3 7   ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs
277
  movzx eax, byte [esi+%1]
286
  movzx eax, byte [esi+%1]
278
  paddw %5, [%2 + eax*8]
287
  paddw %5, [eax*8 + %2]
279
  paddw %6, [%3 + eax*8]
288
  paddw %6, [eax*8 + %3]
280
  paddw %7, [%4 + eax*8]
289
  paddw %7, [eax*8 + %4]
281
%endmacro
290
%endmacro
282
291
283
;//////////////////////////////////////////////////////////////////////
292
;//////////////////////////////////////////////////////////////////////
Lines 287-318 SECTION .text Link Here
287
%macro LOAD 2     ; %1,%2: src pixels
296
%macro LOAD 2     ; %1,%2: src pixels
288
  movzx eax, byte [esi+%1]
297
  movzx eax, byte [esi+%1]
289
  movzx edx, byte [esi+%2]
298
  movzx edx, byte [esi+%2]
290
  movq mm0, [xvid_Expand_mmx + eax*8]
299
  movq mm0, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
291
  movq mm3, [xvid_Expand_mmx + edx*8]
300
  movq mm3, [ecx + xvid_Expand_mmx + edx*8 wrt ..gotoff]
292
  pmullw mm0, [FIR_R0 ]
301
  pmullw mm0, [ecx + FIR_R0  wrt ..gotoff]
293
  pmullw mm3, [FIR_R16]
302
  pmullw mm3, [ecx + FIR_R16 wrt ..gotoff]
294
  paddw mm0, mm7
303
  paddw mm0, mm7
295
  paddw mm3, mm7
304
  paddw mm3, mm7
296
%endmacro
305
%endmacro
297
306
298
%macro ACCUM2 4   ;src pixel/Taps/dst regs #1-#2
307
%macro ACCUM2 4   ;src pixel/Taps/dst regs #1-#2
299
  movzx eax, byte [esi+%1]
308
  movzx eax, byte [esi+%1]
300
  movq mm4, [xvid_Expand_mmx + eax*8]
309
  movq mm4, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
301
  movq mm5, mm4
310
  movq mm5, mm4
302
  pmullw mm4, [%2]
311
  pmullw mm4, [%2]
303
  pmullw mm5, [%2+8]
312
  pmullw mm5, [8+%2]
304
  paddw %3, mm4
313
  paddw %3, mm4
305
  paddw %4, mm5
314
  paddw %4, mm5
306
%endmacro
315
%endmacro
307
316
308
%macro ACCUM3 5   ;src pixel/Taps/dst regs #1-#2-#3
317
%macro ACCUM3 5   ;src pixel/Taps/dst regs #1-#2-#3
309
  movzx eax, byte [esi+%1]
318
  movzx eax, byte [esi+%1]
310
  movq mm4, [xvid_Expand_mmx + eax*8]
319
  movq mm4, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
311
  movq mm5, mm4
320
  movq mm5, mm4
312
  movq mm6, mm5
321
  movq mm6, mm5
313
  pmullw mm4, [%2   ]
322
  pmullw mm4, [   %2]
314
  pmullw mm5, [%2+ 8]
323
  pmullw mm5, [ 8+%2]
315
  pmullw mm6, [%2+16]
324
  pmullw mm6, [16+%2]
316
  paddw %3, mm4
325
  paddw %3, mm4
317
  paddw %4, mm5
326
  paddw %4, mm5
318
  paddw %5, mm6
327
  paddw %5, mm6
Lines 359-381 SECTION .text Link Here
359
  movq mm1, mm7
368
  movq mm1, mm7
360
  movq mm2, mm7
369
  movq mm2, mm7
361
370
362
  ACCUM2 1,    FIR_R1, mm0, mm1
371
  ACCUM2 1,    ecx + FIR_R1 wrt ..gotoff, mm0, mm1
363
  ACCUM2 2,    FIR_R2, mm0, mm1
372
  ACCUM2 2,    ecx + FIR_R2 wrt ..gotoff, mm0, mm1
364
  ACCUM2 3,    FIR_R3, mm0, mm1
373
  ACCUM2 3,    ecx + FIR_R3 wrt ..gotoff, mm0, mm1
365
  ACCUM2 4,    FIR_R4, mm0, mm1
374
  ACCUM2 4,    ecx + FIR_R4 wrt ..gotoff, mm0, mm1
366
375
367
  ACCUM3 5,    FIR_R5, mm0, mm1, mm2
376
  ACCUM3 5,    ecx + FIR_R5 wrt ..gotoff, mm0, mm1, mm2
368
  ACCUM3 6,    FIR_R6, mm0, mm1, mm2
377
  ACCUM3 6,    ecx + FIR_R6 wrt ..gotoff, mm0, mm1, mm2
369
  ACCUM3 7,    FIR_R7, mm0, mm1, mm2
378
  ACCUM3 7,    ecx + FIR_R7 wrt ..gotoff, mm0, mm1, mm2
370
  ACCUM2 8,    FIR_R8, mm1, mm2
379
  ACCUM2 8,    ecx + FIR_R8 wrt ..gotoff, mm1, mm2
371
  ACCUM3 9,    FIR_R9, mm1, mm2, mm3
380
  ACCUM3 9,    ecx + FIR_R9 wrt ..gotoff, mm1, mm2, mm3
372
  ACCUM3 10,   FIR_R10,mm1, mm2, mm3
381
  ACCUM3 10,   ecx + FIR_R10 wrt ..gotoff,mm1, mm2, mm3
373
  ACCUM3 11,   FIR_R11,mm1, mm2, mm3
382
  ACCUM3 11,   ecx + FIR_R11 wrt ..gotoff,mm1, mm2, mm3
374
383
375
  ACCUM2 12,   FIR_R12, mm2, mm3
384
  ACCUM2 12,   ecx + FIR_R12 wrt ..gotoff, mm2, mm3
376
  ACCUM2 13,   FIR_R13, mm2, mm3
385
  ACCUM2 13,   ecx + FIR_R13 wrt ..gotoff, mm2, mm3
377
  ACCUM2 14,   FIR_R14, mm2, mm3
386
  ACCUM2 14,   ecx + FIR_R14 wrt ..gotoff, mm2, mm3
378
  ACCUM2 15,   FIR_R15, mm2, mm3
387
  ACCUM2 15,   ecx + FIR_R15 wrt ..gotoff, mm2, mm3
379
388
380
%else
389
%else
381
390
Lines 383-407 SECTION .text Link Here
383
  movq mm1, mm7
392
  movq mm1, mm7
384
  movq mm2, mm7
393
  movq mm2, mm7
385
394
386
  TACCUM2 1,    xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm1
395
  TACCUM2 1,   ecx + xvid_FIR_23_19_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm1
387
  TACCUM2 2,    xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1
396
  TACCUM2 2,   ecx + xvid_FIR_7_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm1
388
  TACCUM2 3,    xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1
397
  TACCUM2 3,   ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm1
389
  TACCUM2 4,    xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1, mm0, mm1
398
  TACCUM2 4,   ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff, mm0, mm1
390
399
391
  TACCUM3 5,    xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0  , mm0, mm1, mm2
400
  TACCUM3 5,   ecx + xvid_FIR_0_1_3_6 wrt ..gotoff  , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff  , mm0, mm1, mm2
392
  TACCUM3 6,    xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0  , mm0, mm1, mm2
401
  TACCUM3 6,   ecx + xvid_FIR_0_0_1_3 wrt ..gotoff  , ecx + xvid_FIR_6_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff  , mm0, mm1, mm2
393
  TACCUM3 7,    xvid_FIR_0_0_0_1  , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0  , mm0, mm1, mm2
402
  TACCUM3 7,   ecx + xvid_FIR_0_0_0_1 wrt ..gotoff  , ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff  , mm0, mm1, mm2
394
403
395
  TACCUM2 8,                       xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 ,      mm1, mm2
404
  TACCUM2 8,   ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm1, mm2
396
405
397
  TACCUM3 9,                       xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0,  mm1, mm2, mm3
406
  TACCUM3 9,   ecx + xvid_FIR_0_1_3_6 wrt ..gotoff  , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff,  mm1, mm2, mm3
398
  TACCUM3 10,                      xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0,  mm1, mm2, mm3
407
  TACCUM3 10,  ecx + xvid_FIR_0_0_1_3 wrt ..gotoff  , ecx + xvid_FIR_6_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff,  mm1, mm2, mm3
399
  TACCUM3 11,                      xvid_FIR_0_0_0_1  , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0,  mm1, mm2, mm3
408
  TACCUM3 11,  ecx + xvid_FIR_0_0_0_1 wrt ..gotoff  , ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff,  mm1, mm2, mm3
400
409
401
  TACCUM2 12,  xvid_FIR_1_3_6_20, xvid_FIR_20_6_3_1 , mm2, mm3
410
  TACCUM2 12,  ecx + xvid_FIR_1_3_6_20 wrt ..gotoff, ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm2, mm3
402
  TACCUM2 13,  xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm2, mm3
411
  TACCUM2 13,  ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, mm2, mm3
403
  TACCUM2 14,  xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm2, mm3
412
  TACCUM2 14,  ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_7 wrt ..gotoff, mm2, mm3
404
  TACCUM2 15,  xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm2, mm3
413
  TACCUM2 15,  ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_19_23 wrt ..gotoff, mm2, mm3
405
414
406
%endif
415
%endif
407
416
Lines 418-424 SECTION .text Link Here
418
  MIX mm0, esi+1, ebx
427
  MIX mm0, esi+1, ebx
419
%endif
428
%endif
420
%if (%2==1)
429
%if (%2==1)
421
  MIX mm0, edi, Rounder1_MMX
430
  MIX mm0, edi, ecx + Rounder1_MMX wrt ..gotoff
422
%endif
431
%endif
423
432
424
%if (%1==1)
433
%if (%1==1)
Lines 427-433 SECTION .text Link Here
427
  MIX mm2, esi+9, ebx
436
  MIX mm2, esi+9, ebx
428
%endif
437
%endif
429
%if (%2==1)
438
%if (%2==1)
430
  MIX mm2, edi+8, Rounder1_MMX
439
  MIX mm2, edi+8, ecx + Rounder1_MMX wrt ..gotoff
431
%endif
440
%endif
432
441
433
  lea esi, [esi+ebp]
442
  lea esi, [esi+ebp]
Lines 436-442 SECTION .text Link Here
436
  movq [edi+8], mm2
445
  movq [edi+8], mm2
437
446
438
  add edi, ebp
447
  add edi, ebp
439
  dec ecx
448
  dec dword [esp+20 + 2*4]
440
  jg .Loop
449
  jg .Loop
441
450
442
%if (%2==0) && (%1==0)
451
%if (%2==0) && (%1==0)
Lines 464-527 SECTION .text Link Here
464
%ifndef USE_TABLES
473
%ifndef USE_TABLES
465
474
466
  LOAD 0, 8  ; special case for 1rst/last pixel
475
  LOAD 0, 8  ; special case for 1rst/last pixel
467
  ACCUM2 1,  FIR_R1,  mm0, mm3
476
  ACCUM2 1,  ecx + FIR_R1 wrt ..gotoff,  mm0, mm3
468
  ACCUM2 2,  FIR_R2,  mm0, mm3
477
  ACCUM2 2,  ecx + FIR_R2 wrt ..gotoff,  mm0, mm3
469
  ACCUM2 3,  FIR_R3,  mm0, mm3
478
  ACCUM2 3,  ecx + FIR_R3 wrt ..gotoff,  mm0, mm3
470
  ACCUM2 4,  FIR_R4,  mm0, mm3
479
  ACCUM2 4,  ecx + FIR_R4 wrt ..gotoff,  mm0, mm3
471
480
472
  ACCUM2 5,  FIR_R13,  mm0, mm3
481
  ACCUM2 5,  ecx + FIR_R13 wrt ..gotoff,  mm0, mm3
473
  ACCUM2 6,  FIR_R14,  mm0, mm3
482
  ACCUM2 6,  ecx + FIR_R14 wrt ..gotoff,  mm0, mm3
474
  ACCUM2 7,  FIR_R15,  mm0, mm3
483
  ACCUM2 7,  ecx + FIR_R15 wrt ..gotoff,  mm0, mm3
475
484
476
%else
485
%else
477
486
478
%if 0   ; test with no unrolling
487
%if 0   ; test with no unrolling
479
488
480
  TLOAD 0, 8  ; special case for 1rst/last pixel
489
  TLOAD 0, 8  ; special case for 1rst/last pixel
481
  TACCUM2 1,  xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0  , mm0, mm3
490
  TACCUM2 1,  ecx + xvid_FIR_23_19_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff  , mm0, mm3
482
  TACCUM2 2,  xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0  , mm0, mm3
491
  TACCUM2 2,  ecx + xvid_FIR_7_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff  , mm0, mm3
483
  TACCUM2 3,  xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0  , mm0, mm3
492
  TACCUM2 3,  ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff  , mm0, mm3
484
  TACCUM2 4,  xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm0, mm3
493
  TACCUM2 4,  ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm0, mm3
485
  TACCUM2 5,  xvid_FIR_0_1_3_6  , xvid_FIR_20_20_6_3, mm0, mm3
494
  TACCUM2 5,  ecx + xvid_FIR_0_1_3_6 wrt ..gotoff  , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, mm0, mm3
486
  TACCUM2 6,  xvid_FIR_0_0_1_3  , xvid_FIR_6_20_20_7, mm0, mm3
495
  TACCUM2 6,  ecx + xvid_FIR_0_0_1_3 wrt ..gotoff  , ecx + xvid_FIR_6_20_20_7 wrt ..gotoff, mm0, mm3
487
  TACCUM2 7,  xvid_FIR_0_0_0_1  , xvid_FIR_3_6_19_23, mm0, mm3
496
  TACCUM2 7,  ecx + xvid_FIR_0_0_0_1 wrt ..gotoff  , ecx + xvid_FIR_3_6_19_23 wrt ..gotoff, mm0, mm3
488
497
489
%else  ; test with unrolling (little faster, but not much)
498
%else  ; test with unrolling (little faster, but not much)
490
499
491
  movzx eax, byte [esi]
500
  movzx eax, byte [esi]
492
  movzx edx, byte [esi+8]
501
  movzx edx, byte [esi+8]
493
  movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ]
502
  movq mm0, [ecx + xvid_FIR_14_3_2_1 + eax*8 wrt ..gotoff]
494
  movzx eax, byte [esi+1]
503
  movzx eax, byte [esi+1]
495
  movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ]
504
  movq mm3, [ecx + xvid_FIR_1_2_3_14 + edx*8 wrt ..gotoff]
496
  paddw mm0, mm7
505
  paddw mm0, mm7
497
  paddw mm3, mm7
506
  paddw mm3, mm7
498
507
499
  movzx edx, byte [esi+2]
508
  movzx edx, byte [esi+2]
500
  paddw mm0, [xvid_FIR_23_19_6_3 + eax*8]
509
  paddw mm0, [ecx + xvid_FIR_23_19_6_3 + eax*8 wrt ..gotoff]
501
  paddw mm3, [xvid_FIR_1_0_0_0 + eax*8]
510
  paddw mm3, [ecx + xvid_FIR_1_0_0_0 + eax*8 wrt ..gotoff]
502
511
503
  movzx eax, byte [esi+3]
512
  movzx eax, byte [esi+3]
504
  paddw mm0, [xvid_FIR_7_20_20_6 + edx*8]
513
  paddw mm0, [ecx + xvid_FIR_7_20_20_6 + edx*8 wrt ..gotoff]
505
  paddw mm3, [xvid_FIR_3_1_0_0 + edx*8]
514
  paddw mm3, [ecx + xvid_FIR_3_1_0_0 + edx*8 wrt ..gotoff]
506
515
507
  movzx edx, byte [esi+4]
516
  movzx edx, byte [esi+4]
508
  paddw mm0, [xvid_FIR_3_6_20_20 + eax*8]
517
  paddw mm0, [ecx + xvid_FIR_3_6_20_20 + eax*8 wrt ..gotoff]
509
  paddw mm3, [xvid_FIR_6_3_1_0 + eax*8]
518
  paddw mm3, [ecx + xvid_FIR_6_3_1_0 + eax*8 wrt ..gotoff]
510
519
511
  movzx eax, byte [esi+5]
520
  movzx eax, byte [esi+5]
512
  paddw mm0, [xvid_FIR_1_3_6_20 + edx*8]
521
  paddw mm0, [ecx + xvid_FIR_1_3_6_20 + edx*8 wrt ..gotoff]
513
  paddw mm3, [xvid_FIR_20_6_3_1 + edx*8]
522
  paddw mm3, [ecx + xvid_FIR_20_6_3_1 + edx*8 wrt ..gotoff]
514
523
515
  movzx edx, byte [esi+6]
524
  movzx edx, byte [esi+6]
516
  paddw mm0, [xvid_FIR_0_1_3_6 + eax*8]
525
  paddw mm0, [ecx + xvid_FIR_0_1_3_6 + eax*8 wrt ..gotoff]
517
  paddw mm3, [xvid_FIR_20_20_6_3 + eax*8]
526
  paddw mm3, [ecx + xvid_FIR_20_20_6_3 + eax*8 wrt ..gotoff]
518
527
519
  movzx eax, byte [esi+7]
528
  movzx eax, byte [esi+7]
520
  paddw mm0, [xvid_FIR_0_0_1_3 + edx*8]
529
  paddw mm0, [ecx + xvid_FIR_0_0_1_3 + edx*8 wrt ..gotoff]
521
  paddw mm3, [xvid_FIR_6_20_20_7 + edx*8]
530
  paddw mm3, [ecx + xvid_FIR_6_20_20_7 + edx*8 wrt ..gotoff]
522
531
523
  paddw mm0, [xvid_FIR_0_0_0_1 + eax*8]
532
  paddw mm0, [ecx + xvid_FIR_0_0_0_1 + eax*8 wrt ..gotoff]
524
  paddw mm3, [xvid_FIR_3_6_19_23 + eax*8]
533
  paddw mm3, [ecx + xvid_FIR_3_6_19_23 + eax*8 wrt ..gotoff]
525
534
526
%endif
535
%endif
527
536
Lines 537-550 SECTION .text Link Here
537
  MIX mm0, esi+1, ebx
546
  MIX mm0, esi+1, ebx
538
%endif
547
%endif
539
%if (%2==1)
548
%if (%2==1)
540
  MIX mm0, edi, Rounder1_MMX
549
  MIX mm0, edi, ecx + Rounder1_MMX wrt ..gotoff
541
%endif
550
%endif
542
551
543
  movq [edi], mm0
552
  movq [edi], mm0
544
553
545
  add edi, ebp
554
  add edi, ebp
546
  add esi, ebp
555
  add esi, ebp
547
  dec ecx
556
  dec dword [esp+20 + 2*4]
548
  jg .Loop
557
  jg .Loop
549
558
550
%if (%2==0) && (%1==0)
559
%if (%2==0) && (%1==0)
Lines 678-684 xvid_H_Pass_Avrg_Up_8_Add_mmx: Link Here
678
  V_MIX %3, esi, ebx
687
  V_MIX %3, esi, ebx
679
%endif
688
%endif
680
%if (%2==1)
689
%if (%2==1)
681
  V_MIX %3, edi, Rounder1_MMX
690
  V_MIX %3, edi, ecx + Rounder1_MMX wrt ..gotoff
682
%endif
691
%endif
683
692
684
  movd eax, %3
693
  movd eax, %3
Lines 718-745 xvid_H_Pass_Avrg_Up_8_Add_mmx: Link Here
718
  movq mm3, mm7
727
  movq mm3, mm7
719
728
720
  V_LOAD 0
729
  V_LOAD 0
721
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2,  FIR_Cm1
730
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff,  ecx + FIR_Cm1 wrt ..gotoff
722
  V_LOAD 0
731
  V_LOAD 0
723
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
732
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
724
  V_LOAD 0
733
  V_LOAD 0
725
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
734
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
726
  V_LOAD 0
735
  V_LOAD 0
727
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
736
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
728
  V_LOAD 0
737
  V_LOAD 0
729
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
738
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
730
  V_STORE %1, %2, mm0, 0
739
  V_STORE %1, %2, mm0, 0
731
740
732
  V_LOAD 0
741
  V_LOAD 0
733
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
742
  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
734
  V_ACC1 mm3, FIR_Cm6
743
  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
735
  V_STORE %1, %2, mm1, 0
744
  V_STORE %1, %2, mm1, 0
736
745
737
  V_LOAD 0
746
  V_LOAD 0
738
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
747
  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
739
  V_STORE %1, %2, mm2, 0
748
  V_STORE %1, %2, mm2, 0
740
749
741
  V_LOAD 1
750
  V_LOAD 1
742
  V_ACC1 mm3, FIR_Cm1
751
  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
743
  V_STORE %1, %2, mm3, 0
752
  V_STORE %1, %2, mm3, 0
744
753
745
    ; ouput rows [4..7], from input rows [1..11] (!!)
754
    ; ouput rows [4..7], from input rows [1..11] (!!)
Lines 756-793 xvid_H_Pass_Avrg_Up_8_Add_mmx: Link Here
756
  movq mm3, mm7
765
  movq mm3, mm7
757
766
758
  V_LOAD 0
767
  V_LOAD 0
759
  V_ACC1 mm0, FIR_Cm1
768
  V_ACC1 mm0, ecx + FIR_Cm1 wrt ..gotoff
760
769
761
  V_LOAD 0
770
  V_LOAD 0
762
  V_ACC2l mm0, mm1, FIR_C3,  FIR_Cm1
771
  V_ACC2l mm0, mm1, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm1 wrt ..gotoff
763
772
764
  V_LOAD 0
773
  V_LOAD 0
765
  V_ACC2 mm0, mm1, FIR_Cm6,  FIR_C3
774
  V_ACC2 mm0, mm1, ecx + FIR_Cm6 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
766
  V_ACC1 mm2, FIR_Cm1
775
  V_ACC1 mm2, ecx + FIR_Cm1 wrt ..gotoff
767
776
768
  V_LOAD 0
777
  V_LOAD 0
769
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
778
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
770
  V_LOAD 0
779
  V_LOAD 0
771
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
780
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
772
  V_LOAD 0
781
  V_LOAD 0
773
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
782
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
774
  V_LOAD 0
783
  V_LOAD 0
775
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
784
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
776
  V_LOAD 0
785
  V_LOAD 0
777
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
786
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
778
  V_STORE %1, %2, mm0, 0
787
  V_STORE %1, %2, mm0, 0
779
788
780
  V_LOAD 0
789
  V_LOAD 0
781
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
790
  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
782
  V_ACC1 mm3, FIR_Cm6
791
  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
783
  V_STORE %1, %2, mm1, 0
792
  V_STORE %1, %2, mm1, 0
784
793
785
  V_LOAD 0
794
  V_LOAD 0
786
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
795
  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
787
  V_STORE %1, %2, mm2, 0
796
  V_STORE %1, %2, mm2, 0
788
797
789
  V_LOAD 1
798
  V_LOAD 1
790
  V_ACC1 mm3, FIR_Cm1
799
  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
791
  V_STORE %1, %2, mm3, 0
800
  V_STORE %1, %2, mm3, 0
792
801
793
    ; ouput rows [8..11], from input rows [5..15]
802
    ; ouput rows [8..11], from input rows [5..15]
Lines 804-842 xvid_H_Pass_Avrg_Up_8_Add_mmx: Link Here
804
  movq mm3, mm7
813
  movq mm3, mm7
805
814
806
  V_LOAD 0
815
  V_LOAD 0
807
  V_ACC1 mm0, FIR_Cm1
816
  V_ACC1 mm0, ecx + FIR_Cm1 wrt ..gotoff
808
817
809
  V_LOAD 0
818
  V_LOAD 0
810
  V_ACC2l mm0, mm1, FIR_C3,  FIR_Cm1
819
  V_ACC2l mm0, mm1, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm1 wrt ..gotoff
811
820
812
  V_LOAD 0
821
  V_LOAD 0
813
  V_ACC2 mm0, mm1, FIR_Cm6,  FIR_C3
822
  V_ACC2 mm0, mm1, ecx + FIR_Cm6 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
814
  V_ACC1 mm2, FIR_Cm1
823
  V_ACC1 mm2, ecx + FIR_Cm1 wrt ..gotoff
815
824
816
  V_LOAD 0
825
  V_LOAD 0
817
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
826
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
818
  V_LOAD 0
827
  V_LOAD 0
819
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
828
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
820
  V_LOAD 0
829
  V_LOAD 0
821
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
830
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
822
  V_LOAD 0
831
  V_LOAD 0
823
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
832
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
824
  V_LOAD 0
833
  V_LOAD 0
825
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
834
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
826
835
827
  V_STORE %1, %2, mm0, 0
836
  V_STORE %1, %2, mm0, 0
828
837
829
  V_LOAD 0
838
  V_LOAD 0
830
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
839
  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
831
  V_ACC1 mm3, FIR_Cm6
840
  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
832
  V_STORE %1, %2, mm1, 0
841
  V_STORE %1, %2, mm1, 0
833
842
834
  V_LOAD 0
843
  V_LOAD 0
835
  V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
844
  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
836
  V_STORE %1, %2, mm2, 0
845
  V_STORE %1, %2, mm2, 0
837
846
838
  V_LOAD 1
847
  V_LOAD 1
839
  V_ACC1 mm3, FIR_Cm1
848
  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
840
  V_STORE %1, %2, mm3, 0
849
  V_STORE %1, %2, mm3, 0
841
850
842
851
Lines 855-879 xvid_H_Pass_Avrg_Up_8_Add_mmx: Link Here
855
  movq mm3, mm7
864
  movq mm3, mm7
856
865
857
  V_LOAD 0
866
  V_LOAD 0
858
  V_ACC1 mm3, FIR_Cm1
867
  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
859
868
860
  V_LOAD 0
869
  V_LOAD 0
861
  V_ACC2l mm2, mm3, FIR_Cm1,  FIR_C3
870
  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
862
871
863
  V_LOAD 0
872
  V_LOAD 0
864
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
873
  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
865
  V_ACC1 mm3, FIR_Cm6
874
  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
866
875
867
  V_LOAD 0
876
  V_LOAD 0
868
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
877
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
869
  V_LOAD 0
878
  V_LOAD 0
870
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
879
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
871
  V_LOAD 0
880
  V_LOAD 0
872
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
881
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
873
  V_LOAD 0
882
  V_LOAD 0
874
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
883
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
875
  V_LOAD 1
884
  V_LOAD 1
876
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
885
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
877
886
878
  V_STORE %1, %2, mm3, 0
887
  V_STORE %1, %2, mm3, 0
879
  V_STORE %1, %2, mm2, 0
888
  V_STORE %1, %2, mm2, 0
Lines 886-892 xvid_H_Pass_Avrg_Up_8_Add_mmx: Link Here
886
  pop edi
895
  pop edi
887
  add esi, 4
896
  add esi, 4
888
  add edi, 4
897
  add edi, 4
889
  sub ecx, 4
898
  sub dword [esp+20 + 2*4], 4
890
  jg .Loop
899
  jg .Loop
891
900
892
%if (%2==0) && (%1==0)
901
%if (%2==0) && (%1==0)
Lines 924-952 xvid_H_Pass_Avrg_Up_8_Add_mmx: Link Here
924
  movq mm3, mm7
933
  movq mm3, mm7
925
934
926
  V_LOAD 0
935
  V_LOAD 0
927
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2,  FIR_Cm1
936
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff,  ecx + FIR_Cm1 wrt ..gotoff
928
  V_LOAD 0
937
  V_LOAD 0
929
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
938
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
930
  V_LOAD 0
939
  V_LOAD 0
931
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
940
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
932
  V_LOAD 0
941
  V_LOAD 0
933
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
942
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
934
  V_LOAD 0
943
  V_LOAD 0
935
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
944
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
936
  V_STORE %1, %2, mm0, 0
945
  V_STORE %1, %2, mm0, 0
937
946
938
  V_LOAD 0
947
  V_LOAD 0
939
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
948
  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
940
  V_ACC1 mm3, FIR_Cm6
949
  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
941
950
942
  V_STORE %1, %2, mm1, 0
951
  V_STORE %1, %2, mm1, 0
943
952
944
  V_LOAD 0
953
  V_LOAD 0
945
  V_ACC2l mm2, mm3, FIR_Cm1,  FIR_C3
954
  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
946
  V_STORE %1, %2, mm2, 0
955
  V_STORE %1, %2, mm2, 0
947
956
948
  V_LOAD 1
957
  V_LOAD 1
949
  V_ACC1 mm3, FIR_Cm1
958
  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
950
  V_STORE %1, %2, mm3, 0
959
  V_STORE %1, %2, mm3, 0
951
960
952
    ; ouput rows [4..7], from input rows [1..9]
961
    ; ouput rows [4..7], from input rows [1..9]
Lines 964-988 xvid_H_Pass_Avrg_Up_8_Add_mmx: Link Here
964
  movq mm3, mm7
973
  movq mm3, mm7
965
974
966
  V_LOAD 0
975
  V_LOAD 0
967
  V_ACC1 mm3, FIR_Cm1
976
  V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
968
977
969
  V_LOAD 0
978
  V_LOAD 0
970
  V_ACC2l mm2, mm3, FIR_Cm1,  FIR_C3
979
  V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
971
980
972
  V_LOAD 0
981
  V_LOAD 0
973
  V_ACC2 mm1, mm2, FIR_Cm1,  FIR_C3
982
  V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff,  ecx + FIR_C3 wrt ..gotoff
974
  V_ACC1 mm3, FIR_Cm6
983
  V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
975
984
976
  V_LOAD 0
985
  V_LOAD 0
977
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3,  FIR_Cm6, FIR_C20
986
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
978
  V_LOAD 0
987
  V_LOAD 0
979
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C3,  FIR_Cm6, FIR_C20, FIR_C20
988
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff,  ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
980
  V_LOAD 0
989
  V_LOAD 0
981
  V_ACC4  mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
990
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
982
  V_LOAD 0
991
  V_LOAD 0
983
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
992
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
984
  V_LOAD 1
993
  V_LOAD 1
985
  V_ACC4  mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
994
  V_ACC4  mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
986
995
987
  V_STORE %1, %2, mm3, 0
996
  V_STORE %1, %2, mm3, 0
988
  V_STORE %1, %2, mm2, 0
997
  V_STORE %1, %2, mm2, 0
Lines 995-1001 xvid_H_Pass_Avrg_Up_8_Add_mmx: Link Here
995
  pop edi
1004
  pop edi
996
  add esi, 4
1005
  add esi, 4
997
  add edi, 4
1006
  add edi, 4
998
  sub ecx, 4
1007
  sub dword [esp+20 + 2*4], 4
999
  jg .Loop
1008
  jg .Loop
1000
1009
1001
%if (%2==0) && (%1==0)
1010
%if (%2==0) && (%1==0)
(-)xvidcore-1.1.3-old/src/image/x86_asm/reduced_mmx.asm (-61 / +99 lines)
Lines 91-98 cglobal xvid_Filter_Diff_18x18_To_8x8_mm Link Here
91
  pmullw mm4, %4 ; [Up31]
91
  pmullw mm4, %4 ; [Up31]
92
  pmullw %2,  %3 ; [Up13]
92
  pmullw %2,  %3 ; [Up13]
93
  pmullw mm5, %4 ; [Up31]
93
  pmullw mm5, %4 ; [Up31]
94
  paddsw %1, [Cst2]
94
  paddsw %1, [ebp + Cst2 wrt ..gotoff]
95
  paddsw %2, [Cst2]
95
  paddsw %2, [ebp + Cst2 wrt ..gotoff]
96
  paddsw %1, mm4
96
  paddsw %1, mm4
97
  paddsw %2, mm5
97
  paddsw %2, mm5
98
%endmacro
98
%endmacro
Lines 126-139 cglobal xvid_Filter_Diff_18x18_To_8x8_mm Link Here
126
126
127
%macro MIX_ROWS 4   ; %1/%2:prev %3/4:cur (preserved)  mm4/mm5: output
127
%macro MIX_ROWS 4   ; %1/%2:prev %3/4:cur (preserved)  mm4/mm5: output
128
  ; we need to perform: (%1,%3) -> (%1 = 3*%1+%3, mm4 = 3*%3+%1), %3 preserved.
128
  ; we need to perform: (%1,%3) -> (%1 = 3*%1+%3, mm4 = 3*%3+%1), %3 preserved.
129
  movq mm4, [Cst3]
129
  movq mm4, [ebp + Cst3 wrt ..gotoff]
130
  movq mm5, [Cst3]
130
  movq mm5, [ebp + Cst3 wrt ..gotoff]
131
  pmullw mm4, %3
131
  pmullw mm4, %3
132
  pmullw mm5, %4
132
  pmullw mm5, %4
133
  paddsw mm4, %1
133
  paddsw mm4, %1
134
  paddsw mm5, %2
134
  paddsw mm5, %2
135
  pmullw %1, [Cst3]
135
  pmullw %1, [ebp + Cst3 wrt ..gotoff]
136
  pmullw %2, [Cst3]
136
  pmullw %2, [ebp + Cst3 wrt ..gotoff]
137
  paddsw %1, %3
137
  paddsw %1, %3
138
  paddsw %2, %4
138
  paddsw %2, %4
139
%endmacro
139
%endmacro
Lines 176-183 xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 Link Here
176
  mov edx, [esp+8]  ; Src
176
  mov edx, [esp+8]  ; Src
177
  mov eax, [esp+12] ; BpS
177
  mov eax, [esp+12] ; BpS
178
178
179
  movq mm6, [Up13]
179
  push ebp
180
  movq mm7, [Up31]
180
  call get_pc.bp
181
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
182
183
  movq mm6, [ebp + Up13 wrt ..gotoff]
184
  movq mm7, [ebp + Up31 wrt ..gotoff]
181
185
182
  COL03 mm0, mm1, 0
186
  COL03 mm0, mm1, 0
183
  MUL_PACK mm0,mm1, mm6, mm7
187
  MUL_PACK mm0,mm1, mm6, mm7
Lines 223-229 xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 Link Here
223
227
224
  STORE_1 mm2, mm3
228
  STORE_1 mm2, mm3
225
229
226
  mov ecx, [esp+4]
230
  mov ecx, [esp+8]
227
  add ecx, 8
231
  add ecx, 8
228
232
229
  COL47 mm0, mm1, 0
233
  COL47 mm0, mm1, 0
Lines 270-275 xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 Link Here
270
274
271
  STORE_1 mm2, mm3
275
  STORE_1 mm2, mm3
272
276
277
  pop ebp
273
  ret
278
  ret
274
.endfunc
279
.endfunc
275
280
Lines 292-299 xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 Link Here
292
    ;         (x*4 + 2)/4 = x - (x<0)
297
    ;         (x*4 + 2)/4 = x - (x<0)
293
    ; So, better revert to (x*4)/4 = x.
298
    ; So, better revert to (x*4)/4 = x.
294
299
295
  psubsw %1, [Cst2000]
300
  psubsw %1, [ebp + Cst2000 wrt ..gotoff]
296
  psubsw %2, [Cst0002]
301
  psubsw %2, [ebp + Cst0002 wrt ..gotoff]
297
  pxor mm6, mm6
302
  pxor mm6, mm6
298
  pxor mm7, mm7
303
  pxor mm7, mm7
299
  pcmpgtw mm6, %1
304
  pcmpgtw mm6, %1
Lines 308-315 xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 Link Here
308
    ; mix with destination [ecx]
313
    ; mix with destination [ecx]
309
  movq mm6, [ecx]
314
  movq mm6, [ecx]
310
  movq mm7, [ecx]
315
  movq mm7, [ecx]
311
  punpcklbw mm6, [Cst0]
316
  punpcklbw mm6, [ebp + Cst0 wrt ..gotoff]
312
  punpckhbw mm7, [Cst0]
317
  punpckhbw mm7, [ebp + Cst0 wrt ..gotoff]
313
  paddsw %1, mm6
318
  paddsw %1, mm6
314
  paddsw %2, mm7
319
  paddsw %2, mm7
315
  packuswb %1,%2
320
  packuswb %1,%2
Lines 342-357 xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 Link Here
342
    ; mix with destination
347
    ; mix with destination
343
  movq mm6, [ecx]
348
  movq mm6, [ecx]
344
  movq mm7, [ecx]
349
  movq mm7, [ecx]
345
  punpcklbw mm6, [Cst0]
350
  punpcklbw mm6, [ebp + Cst0 wrt ..gotoff]
346
  punpckhbw mm7, [Cst0]
351
  punpckhbw mm7, [ebp + Cst0 wrt ..gotoff]
347
  paddsw %1, mm6
352
  paddsw %1, mm6
348
  paddsw %2, mm7
353
  paddsw %2, mm7
349
354
350
  movq mm6, [ecx+eax]
355
  movq mm6, [ecx+eax]
351
  movq mm7, [ecx+eax]
356
  movq mm7, [ecx+eax]
352
357
353
  punpcklbw mm6, [Cst0]
358
  punpcklbw mm6, [ebp + Cst0 wrt ..gotoff]
354
  punpckhbw mm7, [Cst0]
359
  punpckhbw mm7, [ebp + Cst0 wrt ..gotoff]
355
  paddsw mm4, mm6
360
  paddsw mm4, mm6
356
  paddsw mm5, mm7
361
  paddsw mm5, mm7
357
362
Lines 373-470 xvid_Add_Upsampled_8x8_16To8_mmx: ; 579 Link Here
373
  mov edx, [esp+8]  ; Src
378
  mov edx, [esp+8]  ; Src
374
  mov eax, [esp+12] ; BpS
379
  mov eax, [esp+12] ; BpS
375
380
381
  push ebp
382
  call get_pc.bp
383
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
384
376
  COL03 mm0, mm1, 0
385
  COL03 mm0, mm1, 0
377
  MUL_PACK mm0,mm1, [Up13], [Up31]
386
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
378
  movq mm4, mm0
387
  movq mm4, mm0
379
  movq mm5, mm1
388
  movq mm5, mm1
380
  STORE_ADD_1 mm4, mm5
389
  STORE_ADD_1 mm4, mm5
381
  add ecx, eax
390
  add ecx, eax
382
391
383
  COL03 mm2, mm3, 1
392
  COL03 mm2, mm3, 1
384
  MUL_PACK mm2,mm3, [Up13], [Up31]
393
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
385
  MIX_ROWS mm0, mm1, mm2, mm3
394
  MIX_ROWS mm0, mm1, mm2, mm3
386
  STORE_ADD_2 mm0, mm1
395
  STORE_ADD_2 mm0, mm1
387
396
388
  COL03 mm0, mm1, 2
397
  COL03 mm0, mm1, 2
389
  MUL_PACK mm0,mm1, [Up13], [Up31]
398
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
390
  MIX_ROWS mm2, mm3, mm0, mm1
399
  MIX_ROWS mm2, mm3, mm0, mm1
391
  STORE_ADD_2 mm2, mm3
400
  STORE_ADD_2 mm2, mm3
392
401
393
  COL03 mm2, mm3, 3
402
  COL03 mm2, mm3, 3
394
  MUL_PACK mm2,mm3, [Up13], [Up31]
403
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
395
  MIX_ROWS mm0, mm1, mm2, mm3
404
  MIX_ROWS mm0, mm1, mm2, mm3
396
  STORE_ADD_2 mm0, mm1
405
  STORE_ADD_2 mm0, mm1
397
406
398
  COL03 mm0, mm1, 4
407
  COL03 mm0, mm1, 4
399
  MUL_PACK mm0,mm1, [Up13], [Up31]
408
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
400
  MIX_ROWS mm2, mm3, mm0, mm1
409
  MIX_ROWS mm2, mm3, mm0, mm1
401
  STORE_ADD_2 mm2, mm3
410
  STORE_ADD_2 mm2, mm3
402
411
403
  COL03 mm2, mm3, 5
412
  COL03 mm2, mm3, 5
404
  MUL_PACK mm2,mm3, [Up13], [Up31]
413
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
405
  MIX_ROWS mm0, mm1, mm2, mm3
414
  MIX_ROWS mm0, mm1, mm2, mm3
406
  STORE_ADD_2 mm0, mm1
415
  STORE_ADD_2 mm0, mm1
407
416
408
  COL03 mm0, mm1, 6
417
  COL03 mm0, mm1, 6
409
  MUL_PACK mm0,mm1, [Up13], [Up31]
418
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
410
  MIX_ROWS mm2, mm3, mm0, mm1
419
  MIX_ROWS mm2, mm3, mm0, mm1
411
  STORE_ADD_2 mm2, mm3
420
  STORE_ADD_2 mm2, mm3
412
421
413
  COL03 mm2, mm3, 7
422
  COL03 mm2, mm3, 7
414
  MUL_PACK mm2,mm3, [Up13], [Up31]
423
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
415
  MIX_ROWS mm0, mm1, mm2, mm3
424
  MIX_ROWS mm0, mm1, mm2, mm3
416
  STORE_ADD_2 mm0, mm1
425
  STORE_ADD_2 mm0, mm1
417
426
418
  STORE_ADD_1 mm2, mm3
427
  STORE_ADD_1 mm2, mm3
419
428
420
429
421
  mov ecx, [esp+4]
430
  mov ecx, [esp+8]
422
  add ecx, 8
431
  add ecx, 8
423
432
424
  COL47 mm0, mm1, 0
433
  COL47 mm0, mm1, 0
425
  MUL_PACK mm0,mm1, [Up13], [Up31]
434
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
426
  movq mm4, mm0
435
  movq mm4, mm0
427
  movq mm5, mm1
436
  movq mm5, mm1
428
  STORE_ADD_1 mm4, mm5
437
  STORE_ADD_1 mm4, mm5
429
  add ecx, eax
438
  add ecx, eax
430
439
431
  COL47 mm2, mm3, 1
440
  COL47 mm2, mm3, 1
432
  MUL_PACK mm2,mm3, [Up13], [Up31]
441
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
433
  MIX_ROWS mm0, mm1, mm2, mm3
442
  MIX_ROWS mm0, mm1, mm2, mm3
434
  STORE_ADD_2 mm0, mm1
443
  STORE_ADD_2 mm0, mm1
435
444
436
  COL47 mm0, mm1, 2
445
  COL47 mm0, mm1, 2
437
  MUL_PACK mm0,mm1, [Up13], [Up31]
446
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
438
  MIX_ROWS mm2, mm3, mm0, mm1
447
  MIX_ROWS mm2, mm3, mm0, mm1
439
  STORE_ADD_2 mm2, mm3
448
  STORE_ADD_2 mm2, mm3
440
449
441
  COL47 mm2, mm3, 3
450
  COL47 mm2, mm3, 3
442
  MUL_PACK mm2,mm3, [Up13], [Up31]
451
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
443
  MIX_ROWS mm0, mm1, mm2, mm3
452
  MIX_ROWS mm0, mm1, mm2, mm3
444
  STORE_ADD_2 mm0, mm1
453
  STORE_ADD_2 mm0, mm1
445
454
446
  COL47 mm0, mm1, 4
455
  COL47 mm0, mm1, 4
447
  MUL_PACK mm0,mm1, [Up13], [Up31]
456
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
448
  MIX_ROWS mm2, mm3, mm0, mm1
457
  MIX_ROWS mm2, mm3, mm0, mm1
449
  STORE_ADD_2 mm2, mm3
458
  STORE_ADD_2 mm2, mm3
450
459
451
  COL47 mm2, mm3, 5
460
  COL47 mm2, mm3, 5
452
  MUL_PACK mm2,mm3, [Up13], [Up31]
461
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
453
  MIX_ROWS mm0, mm1, mm2, mm3
462
  MIX_ROWS mm0, mm1, mm2, mm3
454
  STORE_ADD_2 mm0, mm1
463
  STORE_ADD_2 mm0, mm1
455
464
456
  COL47 mm0, mm1, 6
465
  COL47 mm0, mm1, 6
457
  MUL_PACK mm0,mm1, [Up13], [Up31]
466
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
458
  MIX_ROWS mm2, mm3, mm0, mm1
467
  MIX_ROWS mm2, mm3, mm0, mm1
459
  STORE_ADD_2 mm2, mm3
468
  STORE_ADD_2 mm2, mm3
460
469
461
  COL47 mm2, mm3, 7
470
  COL47 mm2, mm3, 7
462
  MUL_PACK mm2,mm3, [Up13], [Up31]
471
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
463
  MIX_ROWS mm0, mm1, mm2, mm3
472
  MIX_ROWS mm0, mm1, mm2, mm3
464
  STORE_ADD_2 mm0, mm1
473
  STORE_ADD_2 mm0, mm1
465
474
466
  STORE_ADD_1 mm2, mm3
475
  STORE_ADD_1 mm2, mm3
467
476
477
  pop ebp
468
  ret
478
  ret
469
.endfunc
479
.endfunc
470
480
Lines 503-510 xvid_Copy_Upsampled_8x8_16To8_xmm: ; 31 Link Here
503
  mov edx, [esp+8]  ; Src
513
  mov edx, [esp+8]  ; Src
504
  mov eax, [esp+12] ; BpS
514
  mov eax, [esp+12] ; BpS
505
515
506
  movq mm6, [Up13]
516
  push ebp
507
  movq mm7, [Up31]
517
  call get_pc.bp
518
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
519
520
  movq mm6, [ebp + Up13 wrt ..gotoff]
521
  movq mm7, [ebp + Up31 wrt ..gotoff]
508
522
509
  COL03_SSE mm0, mm1, 0
523
  COL03_SSE mm0, mm1, 0
510
  MUL_PACK mm0,mm1, mm6, mm7
524
  MUL_PACK mm0,mm1, mm6, mm7
Lines 550-556 xvid_Copy_Upsampled_8x8_16To8_xmm: ; 31 Link Here
550
564
551
  STORE_1 mm2, mm3
565
  STORE_1 mm2, mm3
552
566
553
  mov ecx, [esp+4]
567
  mov ecx, [esp+8]
554
  add ecx, 8
568
  add ecx, 8
555
569
556
  COL47_SSE mm0, mm1, 0
570
  COL47_SSE mm0, mm1, 0
Lines 597-602 xvid_Copy_Upsampled_8x8_16To8_xmm: ; 31 Link Here
597
611
598
  STORE_1 mm2, mm3
612
  STORE_1 mm2, mm3
599
613
614
  pop ebp
600
  ret
615
  ret
601
.endfunc
616
.endfunc
602
617
Lines 614-711 xvid_Add_Upsampled_8x8_16To8_xmm: ; 549 Link Here
614
  mov edx, [esp+8]  ; Src
629
  mov edx, [esp+8]  ; Src
615
  mov eax, [esp+12] ; BpS
630
  mov eax, [esp+12] ; BpS
616
631
632
  push ebp
633
  call get_pc.bp
634
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
635
617
  COL03_SSE mm0, mm1, 0
636
  COL03_SSE mm0, mm1, 0
618
  MUL_PACK mm0,mm1, [Up13], [Up31]
637
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
619
  movq mm4, mm0
638
  movq mm4, mm0
620
  movq mm5, mm1
639
  movq mm5, mm1
621
  STORE_ADD_1 mm4, mm5
640
  STORE_ADD_1 mm4, mm5
622
  add ecx, eax
641
  add ecx, eax
623
642
624
  COL03_SSE mm2, mm3, 1
643
  COL03_SSE mm2, mm3, 1
625
  MUL_PACK mm2,mm3, [Up13], [Up31]
644
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
626
  MIX_ROWS mm0, mm1, mm2, mm3
645
  MIX_ROWS mm0, mm1, mm2, mm3
627
  STORE_ADD_2 mm0, mm1
646
  STORE_ADD_2 mm0, mm1
628
647
629
  COL03_SSE mm0, mm1, 2
648
  COL03_SSE mm0, mm1, 2
630
  MUL_PACK mm0,mm1, [Up13], [Up31]
649
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
631
  MIX_ROWS mm2, mm3, mm0, mm1
650
  MIX_ROWS mm2, mm3, mm0, mm1
632
  STORE_ADD_2 mm2, mm3
651
  STORE_ADD_2 mm2, mm3
633
652
634
  COL03_SSE mm2, mm3, 3
653
  COL03_SSE mm2, mm3, 3
635
  MUL_PACK mm2,mm3, [Up13], [Up31]
654
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
636
  MIX_ROWS mm0, mm1, mm2, mm3
655
  MIX_ROWS mm0, mm1, mm2, mm3
637
  STORE_ADD_2 mm0, mm1
656
  STORE_ADD_2 mm0, mm1
638
657
639
  COL03_SSE mm0, mm1, 4
658
  COL03_SSE mm0, mm1, 4
640
  MUL_PACK mm0,mm1, [Up13], [Up31]
659
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
641
  MIX_ROWS mm2, mm3, mm0, mm1
660
  MIX_ROWS mm2, mm3, mm0, mm1
642
  STORE_ADD_2 mm2, mm3
661
  STORE_ADD_2 mm2, mm3
643
662
644
  COL03_SSE mm2, mm3, 5
663
  COL03_SSE mm2, mm3, 5
645
  MUL_PACK mm2,mm3, [Up13], [Up31]
664
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
646
  MIX_ROWS mm0, mm1, mm2, mm3
665
  MIX_ROWS mm0, mm1, mm2, mm3
647
  STORE_ADD_2 mm0, mm1
666
  STORE_ADD_2 mm0, mm1
648
667
649
  COL03_SSE mm0, mm1, 6
668
  COL03_SSE mm0, mm1, 6
650
  MUL_PACK mm0,mm1, [Up13], [Up31]
669
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
651
  MIX_ROWS mm2, mm3, mm0, mm1
670
  MIX_ROWS mm2, mm3, mm0, mm1
652
  STORE_ADD_2 mm2, mm3
671
  STORE_ADD_2 mm2, mm3
653
672
654
  COL03_SSE mm2, mm3, 7
673
  COL03_SSE mm2, mm3, 7
655
  MUL_PACK mm2,mm3, [Up13], [Up31]
674
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
656
  MIX_ROWS mm0, mm1, mm2, mm3
675
  MIX_ROWS mm0, mm1, mm2, mm3
657
  STORE_ADD_2 mm0, mm1
676
  STORE_ADD_2 mm0, mm1
658
677
659
  STORE_ADD_1 mm2, mm3
678
  STORE_ADD_1 mm2, mm3
660
679
661
680
662
  mov ecx, [esp+4]
681
  mov ecx, [esp+8]
663
  add ecx, 8
682
  add ecx, 8
664
683
665
  COL47_SSE mm0, mm1, 0
684
  COL47_SSE mm0, mm1, 0
666
  MUL_PACK mm0,mm1, [Up13], [Up31]
685
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
667
  movq mm4, mm0
686
  movq mm4, mm0
668
  movq mm5, mm1
687
  movq mm5, mm1
669
  STORE_ADD_1 mm4, mm5
688
  STORE_ADD_1 mm4, mm5
670
  add ecx, eax
689
  add ecx, eax
671
690
672
  COL47_SSE mm2, mm3, 1
691
  COL47_SSE mm2, mm3, 1
673
  MUL_PACK mm2,mm3, [Up13], [Up31]
692
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
674
  MIX_ROWS mm0, mm1, mm2, mm3
693
  MIX_ROWS mm0, mm1, mm2, mm3
675
  STORE_ADD_2 mm0, mm1
694
  STORE_ADD_2 mm0, mm1
676
695
677
  COL47_SSE mm0, mm1, 2
696
  COL47_SSE mm0, mm1, 2
678
  MUL_PACK mm0,mm1, [Up13], [Up31]
697
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
679
  MIX_ROWS mm2, mm3, mm0, mm1
698
  MIX_ROWS mm2, mm3, mm0, mm1
680
  STORE_ADD_2 mm2, mm3
699
  STORE_ADD_2 mm2, mm3
681
700
682
  COL47_SSE mm2, mm3, 3
701
  COL47_SSE mm2, mm3, 3
683
  MUL_PACK mm2,mm3, [Up13], [Up31]
702
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
684
  MIX_ROWS mm0, mm1, mm2, mm3
703
  MIX_ROWS mm0, mm1, mm2, mm3
685
  STORE_ADD_2 mm0, mm1
704
  STORE_ADD_2 mm0, mm1
686
705
687
  COL47_SSE mm0, mm1, 4
706
  COL47_SSE mm0, mm1, 4
688
  MUL_PACK mm0,mm1, [Up13], [Up31]
707
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
689
  MIX_ROWS mm2, mm3, mm0, mm1
708
  MIX_ROWS mm2, mm3, mm0, mm1
690
  STORE_ADD_2 mm2, mm3
709
  STORE_ADD_2 mm2, mm3
691
710
692
  COL47_SSE mm2, mm3, 5
711
  COL47_SSE mm2, mm3, 5
693
  MUL_PACK mm2,mm3, [Up13], [Up31]
712
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
694
  MIX_ROWS mm0, mm1, mm2, mm3
713
  MIX_ROWS mm0, mm1, mm2, mm3
695
  STORE_ADD_2 mm0, mm1
714
  STORE_ADD_2 mm0, mm1
696
715
697
  COL47_SSE mm0, mm1, 6
716
  COL47_SSE mm0, mm1, 6
698
  MUL_PACK mm0,mm1, [Up13], [Up31]
717
  MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
699
  MIX_ROWS mm2, mm3, mm0, mm1
718
  MIX_ROWS mm2, mm3, mm0, mm1
700
  STORE_ADD_2 mm2, mm3
719
  STORE_ADD_2 mm2, mm3
701
720
702
  COL47_SSE mm2, mm3, 7
721
  COL47_SSE mm2, mm3, 7
703
  MUL_PACK mm2,mm3, [Up13], [Up31]
722
  MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
704
  MIX_ROWS mm0, mm1, mm2, mm3
723
  MIX_ROWS mm0, mm1, mm2, mm3
705
  STORE_ADD_2 mm0, mm1
724
  STORE_ADD_2 mm0, mm1
706
725
707
  STORE_ADD_1 mm2, mm3
726
  STORE_ADD_1 mm2, mm3
708
727
728
  pop ebp
709
  ret
729
  ret
710
.endfunc
730
.endfunc
711
731
Lines 732-738 xvid_HFilter_31_mmx: Link Here
732
  mov edi, [esp+8  +8]  ; Src2
752
  mov edi, [esp+8  +8]  ; Src2
733
  mov eax, [esp+12 +8] ; Nb_Blks
753
  mov eax, [esp+12 +8] ; Nb_Blks
734
  lea eax,[eax*2]
754
  lea eax,[eax*2]
735
  movq mm5, [Cst2]
755
  push dword 0x00020002
756
  push dword 0x00020002
757
  movq mm5, [esp]  ; Cst2
758
  add esp, byte 8
736
  pxor mm7, mm7
759
  pxor mm7, mm7
737
760
738
  lea esi, [esi+eax*4]
761
  lea esi, [esi+eax*4]
Lines 848-854 xvid_HFilter_31_x86: Link Here
848
;//////////////////////////////////////////////////////////////////////
871
;//////////////////////////////////////////////////////////////////////
849
872
850
%macro HFILTER_1331 2  ;%1:src %2:dst reg. -trashes mm0/mm1/mm2
873
%macro HFILTER_1331 2  ;%1:src %2:dst reg. -trashes mm0/mm1/mm2
851
  movq mm2, [Mask_ff]
874
  movq mm2, [ebp + Mask_ff wrt ..gotoff]
852
  movq %2,  [%1-1]    ;-10123456
875
  movq %2,  [%1-1]    ;-10123456
853
  movq mm0, [%1]      ; 01234567
876
  movq mm0, [%1]      ; 01234567
854
  movq mm1, [%1+1]    ; 12345678
877
  movq mm1, [%1+1]    ; 12345678
Lines 863-869 xvid_HFilter_31_x86: Link Here
863
%endmacro
886
%endmacro
864
887
865
%macro VFILTER_1331 4  ; %1-4: regs  %1-%2: trashed
888
%macro VFILTER_1331 4  ; %1-4: regs  %1-%2: trashed
866
  paddsw %1, [Cst32]
889
  paddsw %1, [ebp + Cst32 wrt ..gotoff]
867
  paddsw %2, %3
890
  paddsw %2, %3
868
  pmullw %2, mm7
891
  pmullw %2, mm7
869
  paddsw %1,%4
892
  paddsw %1,%4
Lines 899-905 xvid_Filter_18x18_To_8x8_mmx: ; 283c Link Here
899
  mov edx, [esp+8]  ; Src
922
  mov edx, [esp+8]  ; Src
900
  mov eax, [esp+12] ; BpS
923
  mov eax, [esp+12] ; BpS
901
924
902
  movq mm7, [Cst3]
925
  push ebp
926
  call get_pc.bp
927
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
928
929
  movq mm7, [ebp + Cst3 wrt ..gotoff]
903
  sub edx, eax
930
  sub edx, eax
904
931
905
    ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line.
932
    ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line.
Lines 917-923 xvid_Filter_18x18_To_8x8_mmx: ; 283c Link Here
917
944
918
      ; process columns 4-7
945
      ; process columns 4-7
919
946
920
  mov edx, [esp+8]
947
  mov edx, [esp+12]
921
  sub edx, eax
948
  sub edx, eax
922
  add edx, 8
949
  add edx, 8
923
950
Lines 930-935 xvid_Filter_18x18_To_8x8_mmx: ; 283c Link Here
930
  COPY_TWO_LINES_1331 ecx + 4*16 +8
957
  COPY_TWO_LINES_1331 ecx + 4*16 +8
931
  COPY_TWO_LINES_1331 ecx + 6*16 +8
958
  COPY_TWO_LINES_1331 ecx + 6*16 +8
932
959
960
  pop ebp
933
  ret
961
  ret
934
.endfunc
962
.endfunc
935
963
Lines 958-963 xvid_Filter_18x18_To_8x8_mmx: ; 283c Link Here
958
  movq [%1+16], mm2
986
  movq [%1+16], mm2
959
%endmacro
987
%endmacro
960
988
989
extern  _GLOBAL_OFFSET_TABLE_
990
get_pc.bp:
991
  mov ebp, [esp]
992
  retn
993
961
align 16
994
align 16
962
xvid_Filter_Diff_18x18_To_8x8_mmx:  ; 302c
995
xvid_Filter_Diff_18x18_To_8x8_mmx:  ; 302c
963
996
Lines 965-971 xvid_Filter_Diff_18x18_To_8x8_mmx: ; 30 Link Here
965
  mov edx, [esp+8]  ; Src
998
  mov edx, [esp+8]  ; Src
966
  mov eax, [esp+12] ; BpS
999
  mov eax, [esp+12] ; BpS
967
1000
968
  movq mm7, [Cst3]
1001
  push ebp
1002
  call get_pc.bp
1003
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
1004
1005
  movq mm7, [ebp + Cst3 wrt ..gotoff]
969
  sub edx, eax
1006
  sub edx, eax
970
1007
971
    ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line.
1008
    ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line.
Lines 982-988 xvid_Filter_Diff_18x18_To_8x8_mmx: ; 30 Link Here
982
  DIFF_TWO_LINES_1331 ecx + 6*16
1019
  DIFF_TWO_LINES_1331 ecx + 6*16
983
1020
984
      ; process columns 4-7
1021
      ; process columns 4-7
985
  mov edx, [esp+8]
1022
  mov edx, [esp+12]
986
  sub edx, eax
1023
  sub edx, eax
987
  add edx, 8
1024
  add edx, 8
988
1025
Lines 995-1000 xvid_Filter_Diff_18x18_To_8x8_mmx: ; 30 Link Here
995
  DIFF_TWO_LINES_1331 ecx + 4*16 +8
1032
  DIFF_TWO_LINES_1331 ecx + 4*16 +8
996
  DIFF_TWO_LINES_1331 ecx + 6*16 +8
1033
  DIFF_TWO_LINES_1331 ecx + 6*16 +8
997
1034
1035
  pop ebp
998
  ret
1036
  ret
999
.endfunc
1037
.endfunc
1000
1038
(-)xvidcore-1.1.3-old/src/motion/x86_asm/sad_3dn.asm (-16 / +8 lines)
Lines 44-63 BITS 32 Link Here
44
%endmacro
44
%endmacro
45
45
46
;=============================================================================
46
;=============================================================================
47
; Read only data
48
;=============================================================================
49
50
%ifdef FORMAT_COFF
51
SECTION .rodata
52
%else
53
SECTION .rodata align=16
54
%endif
55
56
ALIGN 16
57
mmx_one:
58
	times 4	dw 1
59
60
;=============================================================================
61
; Helper macros
47
; Helper macros
62
;=============================================================================
48
;=============================================================================
63
%macro SADBI_16x16_3DN 0
49
%macro SADBI_16x16_3DN 0
Lines 179-185 sad16bi_3dn: Link Here
179
  SADBI_16x16_3DN
165
  SADBI_16x16_3DN
180
  SADBI_16x16_3DN
166
  SADBI_16x16_3DN
181
167
182
  pmaddwd mm6, [mmx_one] ; collapse
168
  push dword 0x00010001
169
  push dword 0x00010001
170
  pmaddwd mm6, [esp] ; collapse
171
  add esp, byte 8
183
  movq mm7, mm6
172
  movq mm7, mm6
184
  psrlq mm7, 32
173
  psrlq mm7, 32
185
  paddd mm6, mm7
174
  paddd mm6, mm7
Lines 216-222 sad8bi_3dn: Link Here
216
  SADBI_8x8_3DN
205
  SADBI_8x8_3DN
217
  SADBI_8x8_3DN
206
  SADBI_8x8_3DN
218
207
219
  pmaddwd mm6, [mmx_one] ; collapse
208
  push dword 0x00010001
209
  push dword 0x00010001
210
  pmaddwd mm6, [esp] ; collapse
211
  add esp, byte 8
220
  movq mm7, mm6
212
  movq mm7, mm6
221
  psrlq mm7, 32
213
  psrlq mm7, 32
222
  paddd mm6, mm7
214
  paddd mm6, mm7
(-)xvidcore-1.1.3-old/src/motion/x86_asm/sad_mmx.asm (-27 / +32 lines)
Lines 45-64 BITS 32 Link Here
45
%endmacro
45
%endmacro
46
46
47
;=============================================================================
47
;=============================================================================
48
; Read only data
49
;=============================================================================
50
51
%ifdef FORMAT_COFF
52
SECTION .rodata
53
%else
54
SECTION .rodata align=16
55
%endif
56
57
ALIGN 16
58
mmx_one:
59
	times 4	dw 1
60
61
;=============================================================================
62
; Helper macros
48
; Helper macros
63
;=============================================================================
49
;=============================================================================
64
50
Lines 181-188 mmx_one: Link Here
181
167
182
  paddusw mm0, mm2              ; mm01 = ref1 + ref2
168
  paddusw mm0, mm2              ; mm01 = ref1 + ref2
183
  paddusw mm1, mm3
169
  paddusw mm1, mm3
184
  paddusw mm0, [mmx_one]        ; mm01 += 1
170
  paddusw mm0, [esp]            ; mm01 += 1
185
  paddusw mm1, [mmx_one]
171
  paddusw mm1, [esp]
186
  psrlw mm0, 1                  ; mm01 >>= 1
172
  psrlw mm0, 1                  ; mm01 >>= 1
187
  psrlw mm1, 1
173
  psrlw mm1, 1
188
174
Lines 314-320 sad16_mmx: Link Here
314
  SAD_16x16_MMX
300
  SAD_16x16_MMX
315
  SAD_16x16_MMX
301
  SAD_16x16_MMX
316
302
317
  pmaddwd mm6, [mmx_one] ; collapse
303
  pmaddwd mm6, [esp] ; collapse
318
  movq mm7, mm6
304
  movq mm7, mm6
319
  psrlq mm7, 32
305
  psrlq mm7, 32
320
  paddd mm6, mm7
306
  paddd mm6, mm7
Lines 339-344 sad8_mmx: Link Here
339
  mov edx, [esp+ 8] ; Src2
325
  mov edx, [esp+ 8] ; Src2
340
  mov ecx, [esp+12] ; Stride
326
  mov ecx, [esp+12] ; Stride
341
327
328
  push dword 0x00010001
329
  push dword 0x00010001
330
342
  pxor mm6, mm6 ; accum
331
  pxor mm6, mm6 ; accum
343
  pxor mm7, mm7 ; zero
332
  pxor mm7, mm7 ; zero
344
333
Lines 347-359 sad8_mmx: Link Here
347
  SAD_8x8_MMX
336
  SAD_8x8_MMX
348
  SAD_8x8_MMX
337
  SAD_8x8_MMX
349
338
350
  pmaddwd mm6, [mmx_one] ; collapse
339
  pmaddwd mm6, [esp] ; collapse
351
  movq mm7, mm6
340
  movq mm7, mm6
352
  psrlq mm7, 32
341
  psrlq mm7, 32
353
  paddd mm6, mm7
342
  paddd mm6, mm7
354
343
355
  movd eax, mm6
344
  movd eax, mm6
356
345
  add esp, byte 8
357
  ret
346
  ret
358
.endfunc
347
.endfunc
359
348
Lines 377-382 sad16v_mmx: Link Here
377
  mov ecx, [esp + 8 + 12] ; Stride
366
  mov ecx, [esp + 8 + 12] ; Stride
378
  mov ebx, [esp + 8 + 16] ; sad ptr
367
  mov ebx, [esp + 8 + 16] ; sad ptr
379
368
369
  push dword 0x00010001
370
  push dword 0x00010001
371
380
  pxor mm5, mm5 ; accum
372
  pxor mm5, mm5 ; accum
381
  pxor mm6, mm6 ; accum
373
  pxor mm6, mm6 ; accum
382
  pxor mm7, mm7 ; zero
374
  pxor mm7, mm7 ; zero
Lines 390-397 sad16v_mmx: Link Here
390
  SADV_16x16_MMX
382
  SADV_16x16_MMX
391
  SADV_16x16_MMX
383
  SADV_16x16_MMX
392
384
393
  pmaddwd mm5, [mmx_one] ; collapse
385
  pmaddwd mm5, [esp] ; collapse
394
  pmaddwd mm6, [mmx_one] ; collapse
386
  pmaddwd mm6, [esp] ; collapse
395
387
396
  movq mm2, mm5
388
  movq mm2, mm5
397
  movq mm3, mm6
389
  movq mm3, mm6
Lines 421-428 sad16v_mmx: Link Here
421
  SADV_16x16_MMX
413
  SADV_16x16_MMX
422
  SADV_16x16_MMX
414
  SADV_16x16_MMX
423
415
424
  pmaddwd mm5, [mmx_one] ; collapse
416
  pmaddwd mm5, [esp] ; collapse
425
  pmaddwd mm6, [mmx_one] ; collapse
417
  pmaddwd mm6, [esp] ; collapse
426
418
427
  movq mm2, mm5
419
  movq mm2, mm5
428
  movq mm3, mm6
420
  movq mm3, mm6
Lines 442-447 sad16v_mmx: Link Here
442
434
443
  add eax, edi
435
  add eax, edi
444
436
437
  add esp, byte 8
445
  pop edi
438
  pop edi
446
  pop ebx
439
  pop ebx
447
440
Lines 465-470 sad16bi_mmx: Link Here
465
  mov ebx, [esp+4+12] ; Ref2
458
  mov ebx, [esp+4+12] ; Ref2
466
  mov ecx, [esp+4+16] ; Stride
459
  mov ecx, [esp+4+16] ; Stride
467
460
461
  push dword 0x00010001
462
  push dword 0x00010001
463
468
  pxor mm6, mm6 ; accum2
464
  pxor mm6, mm6 ; accum2
469
  pxor mm7, mm7
465
  pxor mm7, mm7
470
.Loop
466
.Loop
Lines 502-513 sad16bi_mmx: Link Here
502
  SADBI_16x16_MMX 0, 0
498
  SADBI_16x16_MMX 0, 0
503
  SADBI_16x16_MMX 8, 1
499
  SADBI_16x16_MMX 8, 1
504
500
505
  pmaddwd mm6, [mmx_one] ; collapse
501
  pmaddwd mm6, [esp] ; collapse
506
  movq mm7, mm6
502
  movq mm7, mm6
507
  psrlq mm7, 32
503
  psrlq mm7, 32
508
  paddd mm6, mm7
504
  paddd mm6, mm7
509
505
510
  movd eax, mm6
506
  movd eax, mm6
507
  add esp, byte 8
511
  pop ebx
508
  pop ebx
512
509
513
  ret
510
  ret
Lines 530-535 sad8bi_mmx: Link Here
530
  mov ebx, [esp+4+12] ; Ref2
527
  mov ebx, [esp+4+12] ; Ref2
531
  mov ecx, [esp+4+16] ; Stride
528
  mov ecx, [esp+4+16] ; Stride
532
529
530
  push dword 0x00010001
531
  push dword 0x00010001
532
533
  pxor mm6, mm6 ; accum2
533
  pxor mm6, mm6 ; accum2
534
  pxor mm7, mm7
534
  pxor mm7, mm7
535
.Loop
535
.Loop
Lines 542-553 sad8bi_mmx: Link Here
542
  SADBI_16x16_MMX 0, 1
542
  SADBI_16x16_MMX 0, 1
543
  SADBI_16x16_MMX 0, 1
543
  SADBI_16x16_MMX 0, 1
544
544
545
  pmaddwd mm6, [mmx_one] ; collapse
545
  pmaddwd mm6, [esp] ; collapse
546
  movq mm7, mm6
546
  movq mm7, mm6
547
  psrlq mm7, 32
547
  psrlq mm7, 32
548
  paddd mm6, mm7
548
  paddd mm6, mm7
549
549
550
  movd eax, mm6
550
  movd eax, mm6
551
  add esp, byte 8
551
  pop ebx
552
  pop ebx
552
  ret
553
  ret
553
.endfunc
554
.endfunc
Lines 568-573 dev16_mmx: Link Here
568
  pxor mm5, mm5 ; accum1
569
  pxor mm5, mm5 ; accum1
569
  pxor mm6, mm6 ; accum2
570
  pxor mm6, mm6 ; accum2
570
571
572
  push dword 0x00010001
573
  push dword 0x00010001
574
571
  MEAN_16x16_MMX
575
  MEAN_16x16_MMX
572
  MEAN_16x16_MMX
576
  MEAN_16x16_MMX
573
  MEAN_16x16_MMX
577
  MEAN_16x16_MMX
Lines 587-593 dev16_mmx: Link Here
587
  MEAN_16x16_MMX
591
  MEAN_16x16_MMX
588
592
589
  paddusw mm6, mm5
593
  paddusw mm6, mm5
590
  pmaddwd mm6, [mmx_one]    ; collapse
594
  pmaddwd mm6, [esp]        ; collapse
591
  movq mm5, mm6
595
  movq mm5, mm6
592
  psrlq mm5, 32
596
  psrlq mm5, 32
593
  paddd mm6, mm5
597
  paddd mm6, mm5
Lines 622-634 dev16_mmx: Link Here
622
  ABS_16x16_MMX
626
  ABS_16x16_MMX
623
  ABS_16x16_MMX
627
  ABS_16x16_MMX
624
628
625
  pmaddwd mm5, [mmx_one]    ; collapse
629
  pmaddwd mm5, [esp]        ; collapse
626
  movq mm6, mm5
630
  movq mm6, mm5
627
  psrlq mm6, 32
631
  psrlq mm6, 32
628
  paddd mm6, mm5
632
  paddd mm6, mm5
629
633
630
  movd eax, mm6
634
  movd eax, mm6
631
635
636
  add esp, byte 8
632
  ret
637
  ret
633
.endfunc
638
.endfunc
634
639
(-)xvidcore-1.1.3-old/src/quant/x86_asm/quantize_h263_3dne.asm (-52 / +81 lines)
Lines 233-239 ALIGN 8 Link Here
233
  movq mm3, [ebx]   ;B2
233
  movq mm3, [ebx]   ;B2
234
%endif
234
%endif
235
%if (%1 == 3)
235
%if (%1 == 3)
236
  imul eax, [int_div+4*edi]
236
  mov esi, [esp + 4]
237
  imul eax, [esi + int_div+4*edi wrt ..gotoff]
237
%endif
238
%endif
238
  pxor mm5, mm4 ;C7
239
  pxor mm5, mm4 ;C7
239
  pxor mm7, mm6 ;D7
240
  pxor mm7, mm6 ;D7
Lines 313-319 ALIGN 8 Link Here
313
%endif
314
%endif
314
  nop
315
  nop
315
%if (%1 == 3)
316
%if (%1 == 3)
316
  imul eax, [int_div+4*edi]
317
  mov esi, [esp +4]
318
  imul eax, [esi + int_div+4*edi wrt ..gotoff]
317
%endif
319
%endif
318
  pxor mm5, mm4 ;C7
320
  pxor mm5, mm4 ;C7
319
  pxor mm7, mm6 ;D7
321
  pxor mm7, mm6 ;D7
Lines 327-349 quant_h263_intra_3dne: Link Here
327
  mov eax, [esp + 12]       ; quant
329
  mov eax, [esp + 12]       ; quant
328
  mov ecx, [esp + 8]        ; data
330
  mov ecx, [esp + 8]        ; data
329
  mov edx, [esp + 4]        ; coeff
331
  mov edx, [esp + 4]        ; coeff
332
  push esi
333
  push ebx
334
  push edi
335
  call get_pc.si
336
  add esi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
337
  push esi
338
330
  cmp al, 1
339
  cmp al, 1
331
  pxor mm1, mm1
340
  pxor mm1, mm1
332
  pxor mm3, mm3
341
  pxor mm3, mm3
333
  movq mm0, [ecx]           ; mm0 = [1st]
342
  movq mm0, [ecx]           ; mm0 = [1st]
334
  movq mm2, [ecx + 8]
343
  movq mm2, [ecx + 8]
335
  push esi
344
  lea ebx, [esi + mmzero wrt ..gotoff]
336
  lea esi, [mmx_div + eax*8 - 8]
345
  lea esi, [esi + mmx_div + eax*8 - 8 wrt ..gotoff]
337
346
338
  push ebx
339
  mov ebx, mmzero
340
  push edi
341
  jz near .q1loop
347
  jz near .q1loop
342
348
343
  quant_intra 0
349
  quant_intra 0
344
  mov ebp, [esp + 16 + 16]      ; dcscalar
350
  mov ebp, [esp + 20 + 16]      ; dcscalar
345
                                ; NB -- there are 3 pushes in the function preambule and one more
351
                                ; NB -- there are 4 pushes in the function preambule and one more
346
                                ; in "quant_intra 0", thus an added offset of 16 bytes
352
                                ; in "quant_intra 0", thus an added offset of 20 bytes
347
  movsx eax, word [byte ecx]    ; DC
353
  movsx eax, word [byte ecx]    ; DC
348
354
349
  quant_intra 1
355
  quant_intra 1
Lines 354-373 quant_h263_intra_3dne: Link Here
354
  quant_intra 2
360
  quant_intra 2
355
  sub eax, edi                      ; DC (+1)
361
  sub eax, edi                      ; DC (+1)
356
  xor ebp, edi                      ; sign(DC) dcscalar /2  (-1)
362
  xor ebp, edi                      ; sign(DC) dcscalar /2  (-1)
357
  mov edi, [esp + 16 + 16]          ; dscalar
363
  mov edi, [esp + 20 + 16]          ; dscalar
358
  lea eax, [byte eax + ebp]         ; DC + sign(DC) dcscalar/2
364
  lea eax, [byte eax + ebp]         ; DC + sign(DC) dcscalar/2
359
  mov ebp, [byte esp]
365
  mov ebp, [byte esp]
360
366
361
  quant_intra 3
367
  quant_intra 3
362
  psubw mm5, mm4                    ;C8
368
  psubw mm5, mm4                    ;C8
363
  mov esi, [esp + 12]               ; pop back the register value
369
  mov esi, [esp + 16]               ; pop back the register value
364
  mov edi, [esp + 4]                ; pop back the register value
370
  mov edi, [esp + 8]                ; pop back the register value
365
  sar eax, 16
371
  sar eax, 16
366
  lea ebx, [byte eax + 1]           ; workaround for eax < 0
372
  lea ebx, [byte eax + 1]           ; workaround for eax < 0
367
  cmovs eax, ebx                    ; conditionnaly move the corrected value
373
  cmovs eax, ebx                    ; conditionnaly move the corrected value
368
  mov [edx], ax                     ; coeff[0] = ax
374
  mov [edx], ax                     ; coeff[0] = ax
369
  mov ebx, [esp + 8]                ; pop back the register value
375
  mov ebx, [esp + 12]               ; pop back the register value
370
  add esp, byte 16                  ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
376
  add esp, byte 20                  ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 20
371
  psubw mm7, mm6                    ;D8
377
  psubw mm7, mm6                    ;D8
372
  movq [edx + 3 * 32 + 16], mm5     ;C9
378
  movq [edx + 3 * 32 + 16], mm5     ;C9
373
  movq [edx + 3 * 32 + 24], mm7     ;D9
379
  movq [edx + 3 * 32 + 24], mm7     ;D9
Lines 379-385 ALIGN 16 Link Here
379
385
380
.q1loop
386
.q1loop
381
  quant_intra1 0
387
  quant_intra1 0
382
  mov ebp, [esp + 16 + 16]          ; dcscalar
388
  mov ebp, [esp + 20 + 16]          ; dcscalar
383
  movsx eax, word [byte ecx]        ; DC
389
  movsx eax, word [byte ecx]        ; DC
384
390
385
  quant_intra1 1
391
  quant_intra1 1
Lines 390-409 ALIGN 16 Link Here
390
  quant_intra1 2
396
  quant_intra1 2
391
  sub eax, edi                      ; DC (+1)
397
  sub eax, edi                      ; DC (+1)
392
  xor ebp, edi                      ; sign(DC) dcscalar /2  (-1)
398
  xor ebp, edi                      ; sign(DC) dcscalar /2  (-1)
393
  mov edi, [esp + 16 + 16]          ; dcscalar
399
  mov edi, [esp + 20 + 16]          ; dcscalar
394
  lea eax, [byte eax + ebp]         ; DC + sign(DC) dcscalar /2
400
  lea eax, [byte eax + ebp]         ; DC + sign(DC) dcscalar /2
395
  mov ebp, [byte esp]
401
  mov ebp, [byte esp]
396
402
397
  quant_intra1 3
403
  quant_intra1 3
398
  psubw mm5, mm4                    ;C8
404
  psubw mm5, mm4                    ;C8
399
  mov esi, [dword esp + 12]         ; pop back the register value
405
  mov esi, [dword esp + 16]         ; pop back the register value
400
  mov edi, [esp + 4]                ; pop back the register value
406
  mov edi, [esp + 8]                ; pop back the register value
401
  sar eax, 16
407
  sar eax, 16
402
  lea ebx, [byte eax + 1]           ; workaround for eax < 0
408
  lea ebx, [byte eax + 1]           ; workaround for eax < 0
403
  cmovs eax, ebx                    ; conditionnaly move the corrected value
409
  cmovs eax, ebx                    ; conditionnaly move the corrected value
404
  mov [edx], ax                     ; coeff[0] = ax
410
  mov [edx], ax                     ; coeff[0] = ax
405
  mov ebx, [esp + 8]                ; pop back the register value
411
  mov ebx, [esp + 12]               ; pop back the register value
406
  add esp, byte 16                  ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
412
  add esp, byte 20                  ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 20
407
  psubw mm7, mm6                    ;D8
413
  psubw mm7, mm6                    ;D8
408
  movq [edx + 3 * 32 + 16], mm5     ;C9
414
  movq [edx + 3 * 32 + 16], mm5     ;C9
409
  movq [edx + 3 * 32 + 24], mm7     ;D9
415
  movq [edx + 3 * 32 + 24], mm7     ;D9
Lines 441-447 ALIGN 16 Link Here
441
%if (%1)
447
%if (%1)
442
  movq [edx + %1*24+16-24], mm2 ;C11
448
  movq [edx + %1*24+16-24], mm2 ;C11
443
%endif
449
%endif
444
  psubusw mm1, [ebx]            ;A5 mm0 -= sub (unsigned, dont go < 0)
450
  psubusw mm1, [esi]            ;A5 mm0 -= sub (unsigned, dont go < 0)
445
  pxor mm4, mm3                 ;B9
451
  pxor mm4, mm3                 ;B9
446
  movq mm2, [eax]               ;C2
452
  movq mm2, [eax]               ;C2
447
  psraw mm0, 15                 ;A6
453
  psraw mm0, 15                 ;A6
Lines 456-462 ALIGN 16 Link Here
456
%else
462
%else
457
  movq [edx + 120], mm4         ;B11
463
  movq [edx + 120], mm4         ;B11
458
%endif
464
%endif
459
  psubusw mm2, [ebx]            ;C5
465
  psubusw mm2, [esi]            ;C5
460
  pxor mm1, mm0                 ;A9 mm0 *= sign(mm0)
466
  pxor mm1, mm0                 ;A9 mm0 *= sign(mm0)
461
  movq mm4, [eax]               ;B2
467
  movq mm4, [eax]               ;B2
462
  psraw mm6, 15                 ;C6
468
  psraw mm6, 15                 ;C6
Lines 467-473 ALIGN 16 Link Here
467
  pmaxsw mm4, mm3               ;B4
473
  pmaxsw mm4, mm3               ;B4
468
  paddw mm5, mm2                ;C8
474
  paddw mm5, mm2                ;C8
469
  movq [byte edx + %1*24], mm1  ;A11
475
  movq [byte edx + %1*24], mm1  ;A11
470
  psubusw mm4, [ebx]            ;B5
476
  psubusw mm4, [esi]            ;B5
471
  pxor mm2, mm6                 ;C9
477
  pxor mm2, mm6                 ;C9
472
%endmacro
478
%endmacro
473
479
Lines 503-517 quant_h263_inter_3dne: Link Here
503
  mov edx, [esp  + 4]               ; coeff
509
  mov edx, [esp  + 4]               ; coeff
504
  mov ecx, [esp  + 8]               ; data
510
  mov ecx, [esp  + 8]               ; data
505
  mov eax, [esp  + 12]              ; quant
511
  mov eax, [esp  + 12]              ; quant
506
  push ebx
512
  push esi
513
514
  call get_pc.si
515
  add esi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
516
  push esi
507
517
508
  pxor mm5, mm5                     ; sum
518
  pxor mm5, mm5                     ; sum
509
  nop
519
  nop
510
  lea ebx,[mmx_sub + eax * 8 - 8]   ; sub
520
  movq mm7, [esi + mmx_div + eax * 8 - 8 wrt ..gotoff] ; divider
511
  movq mm7, [mmx_div + eax * 8 - 8] ; divider
521
  lea esi,[esi + mmx_sub + eax * 8 - 8 wrt ..gotoff]   ; sub
512
522
513
  cmp al, 1
523
  cmp al, 1
514
  lea eax, [mmzero]
524
  mov eax, [esp]
525
  lea eax, [eax + mmzero wrt ..gotoff]
515
  jz near .q1loop
526
  jz near .q1loop
516
  cmp esp, esp
527
  cmp esp, esp
517
ALIGN 8
528
ALIGN 8
Lines 520-526 ALIGN 8 Link Here
520
  psubw mm4, mm3            ;B3
531
  psubw mm4, mm3            ;B3
521
  movq mm0, [ecx]           ;A1 mm0 = [1st]
532
  movq mm0, [ecx]           ;A1 mm0 = [1st]
522
  pmaxsw mm4, mm3           ;B4
533
  pmaxsw mm4, mm3           ;B4
523
  psubusw mm4, [ebx]        ;B5
534
  psubusw mm4, [esi]        ;B5
524
535
525
  quantinter 0
536
  quantinter 0
526
  quantinter 1
537
  quantinter 1
Lines 535-553 ALIGN 8 Link Here
535
  pxor mm4, mm3             ;B9
546
  pxor mm4, mm3             ;B9
536
  psubw mm4, mm3            ;B10
547
  psubw mm4, mm3            ;B10
537
  movq [edx + 4*24+16], mm2 ;C11
548
  movq [edx + 4*24+16], mm2 ;C11
538
  pop ebx
539
  movq [edx + 4*24+8], mm4  ;B11
549
  movq [edx + 4*24+8], mm4  ;B11
540
  pmaddwd mm5, [plus_one]
550
  pop esi
551
  pmaddwd mm5, [esi + plus_one wrt ..gotoff]
541
  movq mm0, mm5
552
  movq mm0, mm5
542
  punpckhdq mm5, mm5
553
  punpckhdq mm5, mm5
543
  paddd mm0, mm5
554
  paddd mm0, mm5
544
  movd eax, mm0             ; return sum
555
  movd eax, mm0             ; return sum
545
556
557
  pop esi
546
  ret
558
  ret
547
559
548
ALIGN 16
560
ALIGN 16
549
.q1loop
561
.q1loop
550
  movq mm6, [byte ebx]
562
  movq mm6, [byte esi]
551
563
552
  quantinter1 0
564
  quantinter1 0
553
  quantinter1 1
565
  quantinter1 1
Lines 558-570 ALIGN 16 Link Here
558
  quantinter1 6
570
  quantinter1 6
559
  quantinter1 7
571
  quantinter1 7
560
572
561
  pmaddwd mm5, [plus_one]
573
  pop esi
574
  pmaddwd mm5, [esi + plus_one wrt ..gotoff]
562
  movq mm0, mm5
575
  movq mm0, mm5
563
  psrlq mm5, 32
576
  psrlq mm5, 32
564
  paddd mm0, mm5
577
  paddd mm0, mm5
565
  movd eax, mm0 ; return sum
578
  movd eax, mm0 ; return sum
566
579
567
  pop ebx
580
  pop esi
568
581
569
  ret
582
  ret
570
.endfunc
583
.endfunc
Lines 658-680 dequant_h263_intra_3dne: Link Here
658
  pxor mm2, mm2
671
  pxor mm2, mm2
659
  push edi
672
  push edi
660
  push ebx
673
  push ebx
661
  lea edi, [mmx_mul + eax*8 - 8]    ; 2*quant
662
  push ebp
674
  push ebp
663
  mov ebx, mmx_2047
664
  movsx ebp, word [ecx]
665
  lea eax, [mmx_add + eax*8 - 8]    ; quant or quant-1
666
  push esi
675
  push esi
667
  mov esi, mmzero
676
677
  call get_pc.si
678
  add esi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
679
  push esi
680
681
  lea edi, [esi + mmx_mul + eax*8 - 8 wrt ..gotoff]    ; 2*quant
682
  lea ebx, [esi + mmx_2047 wrt ..gotoff]
683
  movsx ebp, word [ecx]
684
  lea eax, [esi + mmx_add + eax*8 - 8 wrt ..gotoff]    ; quant or quant-1
685
  lea esi, [esi + mmzero wrt ..gotoff]
668
  pxor mm7, mm7
686
  pxor mm7, mm7
669
  movq mm3, [ecx+120]               ;B2 ; c  = coeff[i]
687
  movq mm3, [ecx+120]               ;B2 ; c  = coeff[i]
670
  pcmpeqw mm7, [ecx+120]            ;B6 (c ==0) ? -1 : 0 (1st)
688
  pcmpeqw mm7, [ecx+120]            ;B6 (c ==0) ? -1 : 0 (1st)
671
689
672
  imul ebp, [esp+16+16]             ; dcscalar
690
  imul ebp, [esp+16+20]             ; dcscalar
673
  psubw mm2, mm3                    ;-c         ;B3 (1st dep)
691
  psubw mm2, mm3                    ;-c         ;B3 (1st dep)
674
  pmaxsw mm2, mm3                   ;|c|        ;B4 (2nd)
692
  pmaxsw mm2, mm3                   ;|c|        ;B4 (2nd)
675
  pmullw mm2, [edi]                 ;*= 2Q  ;B8 (3rd+)
693
  pmullw mm2, [edi]                 ;*= 2Q  ;B8 (3rd+)
676
  psraw mm3, 15                     ; sign(c)   ;B7 (2nd)
694
  psraw mm3, 15                     ; sign(c)   ;B7 (2nd)
677
  mov edx, [esp+ 4+16]              ; data
695
  mov edx, [esp+ 4+20]              ; data
678
696
679
ALIGN 8
697
ALIGN 8
680
  dequant 0
698
  dequant 0
Lines 684-690 ALIGN 8 Link Here
684
702
685
  dequant 1
703
  dequant 1
686
704
687
  cmovl ebp, [int_2048]
705
  mov ebp, [esp]
706
  cmovl ebp, [ebp + int_2048 wrt ..gotoff]
688
  nop
707
  nop
689
708
690
  dequant 2
709
  dequant 2
Lines 694-700 ALIGN 8 Link Here
694
713
695
  dequant 3
714
  dequant 3
696
715
697
  cmovg ebp, [int2047]
716
  mov ebp, [esp]
717
  cmovg ebp, [ebp + int2047 wrt ..gotoff]
698
  nop
718
  nop
699
719
700
  dequant 4
720
  dequant 4
Lines 703-718 ALIGN 8 Link Here
703
  pminsw mm4, [ebx]         ;C12 saturates to +2047 (5th+)
723
  pminsw mm4, [ebx]         ;C12 saturates to +2047 (5th+)
704
  pandn mm7, [eax]          ;B9 offset = isZero ? 0 : quant_add (2nd)
724
  pandn mm7, [eax]          ;B9 offset = isZero ? 0 : quant_add (2nd)
705
  mov eax, ebp
725
  mov eax, ebp
706
  mov esi, [esp]
726
  mov esi, [esp+4]
707
  mov ebp, [esp+4]
727
  mov ebp, [esp+8]
708
  pxor mm5, mm4             ;C13 (6th+)
728
  pxor mm5, mm4             ;C13 (6th+)
709
  paddw mm7, mm3            ;B10  offset +negate back (3rd)
729
  paddw mm7, mm3            ;B10  offset +negate back (3rd)
710
  movq [edx+4*24+16], mm5   ;C14 (7th)
730
  movq [edx+4*24+16], mm5   ;C14 (7th)
711
  paddw mm2, mm7            ;B11 mm7 free (4th+)
731
  paddw mm2, mm7            ;B11 mm7 free (4th+)
712
  pminsw mm2, [ebx]         ;B12 saturates to +2047 (5th+)
732
  pminsw mm2, [ebx]         ;B12 saturates to +2047 (5th+)
713
  mov ebx, [esp+8]
733
  mov ebx, [esp+12]
714
  mov edi, [esp+12]
734
  mov edi, [esp+16]
715
  add esp, byte 16
735
  add esp, byte 20
716
  pxor mm3, mm2             ;B13 (6th+)
736
  pxor mm3, mm2             ;B13 (6th+)
717
  movq [edx+4*24+8], mm3    ;B14 (7th)
737
  movq [edx+4*24+8], mm3    ;B14 (7th)
718
  mov [edx], ax
738
  mov [edx], ax
Lines 721-726 ALIGN 8 Link Here
721
  ret
741
  ret
722
.endfunc
742
.endfunc
723
743
744
extern  _GLOBAL_OFFSET_TABLE_
745
get_pc.si:
746
  mov esi, [esp]
747
  retn
748
724
;-----------------------------------------------------------------------------
749
;-----------------------------------------------------------------------------
725
;
750
;
726
; uint32_t dequant_h263_inter_3dne(int16_t * data,
751
; uint32_t dequant_h263_inter_3dne(int16_t * data,
Lines 744-757 dequant_h263_inter_3dne: Link Here
744
  push edi
769
  push edi
745
  push ebx
770
  push ebx
746
  push esi
771
  push esi
747
  lea edi, [mmx_mul + eax*8 - 8]    ; 2*quant
772
748
  mov ebx, mmx_2047
773
  call get_pc.si
774
  add esi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
775
776
  lea edi, [esi + mmx_mul + eax*8 - 8 wrt ..gotoff]    ; 2*quant
777
  lea ebx, [esi + mmx_2047 wrt ..gotoff]
749
  pxor mm7, mm7
778
  pxor mm7, mm7
750
  movq mm3, [ecx+120]               ;B2 ; c  = coeff[i]
779
  movq mm3, [ecx+120]               ;B2 ; c  = coeff[i]
751
  pcmpeqw mm7, [ecx+120]            ;B6 (c ==0) ? -1 : 0 (1st)
780
  pcmpeqw mm7, [ecx+120]            ;B6 (c ==0) ? -1 : 0 (1st)
752
  lea eax, [mmx_add + eax*8 - 8]    ; quant or quant-1
781
  lea eax, [esi + mmx_add + eax*8 - 8 wrt ..gotoff]    ; quant or quant-1
753
  psubw mm2, mm3                    ;-c ;B3 (1st dep)
782
  psubw mm2, mm3                    ;-c ;B3 (1st dep)
754
  mov esi, mmzero
783
  lea esi, [esi + mmzero wrt ..gotoff]
755
  pmaxsw mm2, mm3                   ;|c|        ;B4 (2nd)
784
  pmaxsw mm2, mm3                   ;|c|        ;B4 (2nd)
756
  pmullw mm2, [edi]                 ;*= 2Q      ;B8 (3rd+)
785
  pmullw mm2, [edi]                 ;*= 2Q      ;B8 (3rd+)
757
  psraw mm3, 15                     ; sign(c)   ;B7 (2nd)
786
  psraw mm3, 15                     ; sign(c)   ;B7 (2nd)
(-)xvidcore-1.1.3-old/src/quant/x86_asm/quantize_h263_mmx.asm (-26 / +67 lines)
Lines 139-147 ALIGN 16 Link Here
139
quant_h263_intra_mmx:
139
quant_h263_intra_mmx:
140
140
141
  push esi
141
  push esi
142
  push edi
142
143
143
  mov esi, [esp + 4 + 8]     ; data
144
  mov esi, [esp + 8 + 8]     ; data
144
  mov ecx,[esp + 4 + 16]     ; dcscalar
145
  mov ecx,[esp + 8 + 16]     ; dcscalar
145
  movsx eax, word [esi]      ; data[0]
146
  movsx eax, word [esi]      ; data[0]
146
   
147
   
147
  sar ecx,1                  ; dcscalar /2
148
  sar ecx,1                  ; dcscalar /2
Lines 151-164 quant_h263_intra_mmx: Link Here
151
  sub eax,edx
152
  sub eax,edx
152
  add eax,ecx                ; + (dcscalar/2)*sgn(data[0])
153
  add eax,ecx                ; + (dcscalar/2)*sgn(data[0])
153
154
154
  mov ecx, [esp + 4 + 12]    ; quant
155
  mov ecx, [esp + 8 + 12]    ; quant
155
  cdq 
156
  cdq 
156
  idiv dword [esp + 4 + 16]  ; dcscalar
157
  idiv dword [esp + 8 + 16]  ; dcscalar
157
  cmp ecx, 1
158
  cmp ecx, 1
158
  mov edx, [esp + 4 + 4]     ; coeff
159
  mov edx, [esp + 8 + 4]     ; coeff
159
  je .low
160
  je .low
160
 
161
161
  movq mm7, [mmx_div+ecx * 8 - 8]
162
  call get_pc.di
163
  add edi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
164
165
  movq mm7, [edi + mmx_div+ecx * 8 - 8 wrt ..gotoff]
162
  mov ecx,4
166
  mov ecx,4
163
167
164
.loop
168
.loop
Lines 228-237 quant_h263_intra_mmx: Link Here
228
  jne .loop_low
232
  jne .loop_low
229
  
233
  
230
.end
234
.end
231
  mov edx, [esp + 4 + 4]     ; coeff
235
  mov edx, [esp + 8 + 4]     ; coeff
232
  mov [edx],ax  
236
  mov [edx],ax  
233
  xor eax,eax                ; return 0
237
  xor eax,eax                ; return 0
234
238
239
  pop edi
235
  pop esi
240
  pop esi
236
  ret
241
  ret
237
.endfunc
242
.endfunc
Lines 251-273 ALIGN 16 Link Here
251
quant_h263_intra_sse2:
256
quant_h263_intra_sse2:
252
257
253
  push esi
258
  push esi
259
  push edi
254
260
255
  mov esi, [esp + 4 + 8]     ; data
261
  mov esi, [esp + 8 + 8]     ; data
256
 
262
 
257
  movsx eax, word [esi]      ; data[0]
263
  movsx eax, word [esi]      ; data[0]
258
 
264
 
259
  mov ecx,[esp + 4 + 16]     ; dcscalar
265
  mov ecx,[esp + 8 + 16]     ; dcscalar
260
  mov edx,eax
266
  mov edx,eax
261
  sar ecx,1
267
  sar ecx,1
262
  add eax,ecx
268
  add eax,ecx
263
  sub edx,ecx
269
  sub edx,ecx
264
  cmovl eax,edx              ; +/- dcscalar/2
270
  cmovl eax,edx              ; +/- dcscalar/2
265
  mov ecx, [esp + 4 + 12]    ; quant
271
  mov ecx, [esp + 8 + 12]    ; quant
266
  cdq 
272
  cdq 
267
  idiv dword [esp + 4 + 16]  ; dcscalar
273
  idiv dword [esp + 8 + 16]  ; dcscalar
268
  cmp ecx, 1
274
  cmp ecx, 1
269
  mov edx, [esp + 4 + 4]     ; coeff
275
  mov edx, [esp + 8 + 4]     ; coeff
270
  movq xmm7, [mmx_div+ecx * 8 - 8]
276
277
  call get_pc.di
278
  add edi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
279
280
  movq xmm7, [edi + mmx_div+ecx * 8 - 8 wrt ..gotoff]
271
  je .low
281
  je .low
272
  
282
  
273
  mov ecx,2
283
  mov ecx,2
Lines 340-349 quant_h263_intra_sse2: Link Here
340
  jne .loop_low
350
  jne .loop_low
341
  
351
  
342
.end
352
.end
343
  mov edx, [esp + 4 + 4]     ; coeff
353
  mov edx, [esp + 8 + 4]     ; coeff
344
  mov [edx],ax  
354
  mov [edx],ax  
345
  xor eax,eax                ; return 0
355
  xor eax,eax                ; return 0
346
356
357
  pop edi
347
  pop esi
358
  pop esi
348
  ret
359
  ret
349
.endfunc
360
.endfunc
Lines 370-382 quant_h263_inter_mmx: Link Here
370
381
371
  xor ecx, ecx
382
  xor ecx, ecx
372
383
384
  call get_pc.dx
385
  add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
386
373
  pxor mm5, mm5                     ; sum
387
  pxor mm5, mm5                     ; sum
374
  movq mm6, [mmx_sub + eax * 8 - 8] ; sub
388
  movq mm6, [edx + mmx_sub + eax * 8 - 8 wrt ..gotoff] ; sub
375
389
376
  cmp al, 1
390
  cmp al, 1
377
  jz .q1loop
391
  jz .q1loop
378
392
379
  movq mm7, [mmx_div + eax * 8 - 8] ; divider
393
  movq mm7, [edx + mmx_div + eax * 8 - 8 wrt ..gotoff] ; divider
380
394
381
ALIGN 8
395
ALIGN 8
382
.loop
396
.loop
Lines 408-414 ALIGN 8 Link Here
408
  jnz .loop
422
  jnz .loop
409
423
410
.done
424
.done
411
  pmaddwd mm5, [plus_one]
425
  pmaddwd mm5, [edx + plus_one wrt ..gotoff]
412
  movq mm0, mm5
426
  movq mm0, mm5
413
  psrlq mm5, 32
427
  psrlq mm5, 32
414
  paddd mm0, mm5
428
  paddd mm0, mm5
Lines 477-483 quant_h263_inter_sse2: Link Here
477
491
478
  pxor xmm5, xmm5                           ; sum
492
  pxor xmm5, xmm5                           ; sum
479
493
480
  movq mm0, [mmx_sub + eax*8 - 8]           ; sub
494
  call get_pc.dx
495
  add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
496
497
  movq mm0, [edx + mmx_sub + eax*8 - 8 wrt ..gotoff]           ; sub
481
  movq2dq xmm6, mm0                         ; load into low 8 bytes
498
  movq2dq xmm6, mm0                         ; load into low 8 bytes
482
  movlhps xmm6, xmm6                        ; duplicate into high 8 bytes
499
  movlhps xmm6, xmm6                        ; duplicate into high 8 bytes
483
500
Lines 485-491 quant_h263_inter_sse2: Link Here
485
  jz near .qes2_q1loop
502
  jz near .qes2_q1loop
486
503
487
.qes2_not1
504
.qes2_not1
488
  movq mm0, [mmx_div + eax*8 - 8]           ; divider
505
  movq mm0, [edx + mmx_div + eax*8 - 8 wrt ..gotoff]           ; divider
489
  movq2dq xmm7, mm0
506
  movq2dq xmm7, mm0
490
  movlhps xmm7, xmm7
507
  movlhps xmm7, xmm7
491
508
Lines 519-525 ALIGN 16 Link Here
519
  jnz .qes2_loop
536
  jnz .qes2_loop
520
537
521
.qes2_done
538
.qes2_done
522
  movdqu xmm6, [plus_one]
539
  movdqu xmm6, [edx + plus_one wrt ..gotoff]
523
  pmaddwd xmm5, xmm6
540
  pmaddwd xmm5, xmm6
524
  movhlps xmm6, xmm5
541
  movhlps xmm6, xmm5
525
  paddd xmm5, xmm6
542
  paddd xmm5, xmm6
Lines 583-590 dequant_h263_intra_mmx: Link Here
583
600
584
  mov ecx, [esp+12]                 ; quant
601
  mov ecx, [esp+12]                 ; quant
585
  mov eax, [esp+ 8]                 ; coeff
602
  mov eax, [esp+ 8]                 ; coeff
603
604
  call get_pc.dx
605
  add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
606
586
  pcmpeqw mm0,mm0
607
  pcmpeqw mm0,mm0
587
  movq mm6, [mmx_quant + ecx*8]     ; quant
608
  movq mm6, [edx + mmx_quant + ecx*8 wrt ..gotoff]     ; quant
588
  shl ecx,31                        ; quant & 1 ? 0 : - 1
609
  shl ecx,31                        ; quant & 1 ? 0 : - 1
589
  movq mm7,mm6
610
  movq mm7,mm6
590
  movq mm5,mm0
611
  movq mm5,mm0
Lines 841-848 dequant_h263_inter_mmx: Link Here
841
862
842
  mov ecx, [esp+12]                 ; quant
863
  mov ecx, [esp+12]                 ; quant
843
  mov eax, [esp+ 8]                 ; coeff
864
  mov eax, [esp+ 8]                 ; coeff
865
866
  call get_pc.dx
867
  add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
868
844
  pcmpeqw mm0,mm0
869
  pcmpeqw mm0,mm0
845
  movq mm6, [mmx_quant + ecx*8]     ; quant
870
  movq mm6, [edx + mmx_quant + ecx*8 wrt ..gotoff]     ; quant
846
  shl ecx,31                        ; odd/even
871
  shl ecx,31                        ; odd/even
847
  movq mm7,mm6
872
  movq mm7,mm6
848
  movd mm1,ecx
873
  movd mm1,ecx
Lines 912-919 dequant_h263_inter_xmm: Link Here
912
937
913
  mov ecx, [esp+12]                 ; quant
938
  mov ecx, [esp+12]                 ; quant
914
  mov eax, [esp+ 8]                 ; coeff
939
  mov eax, [esp+ 8]                 ; coeff
940
941
  call get_pc.dx
942
  add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
943
915
  pcmpeqw mm0,mm0
944
  pcmpeqw mm0,mm0
916
  movq mm6, [mmx_quant + ecx*8]     ; quant
945
  movq mm6, [edx + mmx_quant + ecx*8 wrt ..gotoff]     ; quant
917
  shl ecx,31
946
  shl ecx,31
918
  movq mm5,mm0
947
  movq mm5,mm0
919
  movd mm1,ecx
948
  movd mm1,ecx
Lines 967-973 dequant_h263_inter_xmm: Link Here
967
  ret
996
  ret
968
.endfunc
997
.endfunc
969
998
970
 
999
extern  _GLOBAL_OFFSET_TABLE_
1000
get_pc
1001
.di:
1002
  mov edi, [esp]
1003
  retn
1004
1005
.dx:
1006
  mov edx, [esp]
1007
  retn
1008
971
;-----------------------------------------------------------------------------
1009
;-----------------------------------------------------------------------------
972
;
1010
;
973
; uint32_t dequant_h263_inter_sse2(int16_t * data,
1011
; uint32_t dequant_h263_inter_sse2(int16_t * data,
Lines 983-989 dequant_h263_inter_sse2: Link Here
983
  mov ecx, [esp+12]                 ; quant
1021
  mov ecx, [esp+12]                 ; quant
984
  mov eax, [esp+ 8]                 ; coeff
1022
  mov eax, [esp+ 8]                 ; coeff
985
1023
986
  movq xmm6, [mmx_quant + ecx*8]    ; quant
1024
  call get_pc.dx
1025
  add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
1026
1027
  movq xmm6, [edx + mmx_quant + ecx*8 wrt ..gotoff]    ; quant
987
  inc ecx
1028
  inc ecx
988
  pcmpeqw xmm5,xmm5
1029
  pcmpeqw xmm5,xmm5
989
  and ecx,1
1030
  and ecx,1
(-)xvidcore-1.1.3-old/src/quant/x86_asm/quantize_mpeg_mmx.asm (-15 / +39 lines)
Lines 162-168 quant_mpeg_intra_mmx: Link Here
162
  mov eax, [esp + 16 + 12]      ; quant
162
  mov eax, [esp + 16 + 12]      ; quant
163
  mov ebx, [esp + 16 + 20]		; mpeg_quant_matrices
163
  mov ebx, [esp + 16 + 20]		; mpeg_quant_matrices
164
164
165
  movq mm5, [quantd + eax * 8 - 8] ; quantd -> mm5
165
  push ebp
166
  call get_pc.bp
167
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
168
169
  movq mm5, [ebp + quantd + eax * 8 - 8 wrt ..gotoff] ; quantd -> mm5
166
170
167
  xor ecx, ecx
171
  xor ecx, ecx
168
  cmp al, 1
172
  cmp al, 1
Lines 171-177 quant_mpeg_intra_mmx: Link Here
171
  cmp al, 2
175
  cmp al, 2
172
  jz near .q2loop
176
  jz near .q2loop
173
177
174
  movq mm7, [mmx_div + eax * 8 - 8] ; multipliers[quant] -> mm7
178
  movq mm7, [ebp + mmx_div + eax * 8 - 8 wrt ..gotoff] ; multipliers[quant] -> mm7
175
179
176
ALIGN 16
180
ALIGN 16
177
.loop
181
.loop
Lines 234-239 ALIGN 16 Link Here
234
238
235
  mov [edi], ax             ; coeff[0] = ax
239
  mov [edi], ax             ; coeff[0] = ax
236
240
241
  pop ebp
237
  pop ebx
242
  pop ebx
238
  pop edi
243
  pop edi
239
  pop esi
244
  pop esi
Lines 346-351 quant_mpeg_inter_mmx: Link Here
346
  mov eax, [esp + 16 + 12]  ; quant
351
  mov eax, [esp + 16 + 12]  ; quant
347
  mov ebx, [esp + 16 + 16]		; mpeg_quant_matrices
352
  mov ebx, [esp + 16 + 16]		; mpeg_quant_matrices
348
353
354
  push ebp
355
  call get_pc.bp
356
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
357
349
  xor ecx, ecx
358
  xor ecx, ecx
350
359
351
  pxor mm5, mm5                 ; sum
360
  pxor mm5, mm5                 ; sum
Lines 356-362 quant_mpeg_inter_mmx: Link Here
356
  cmp al, 2
365
  cmp al, 2
357
  jz near .q2loop
366
  jz near .q2loop
358
367
359
  movq mm7, [mmx_div + eax * 8 - 8] ; divider
368
  movq mm7, [ebp + mmx_div + eax * 8 - 8 wrt ..gotoff] ; divider
360
369
361
ALIGN 16
370
ALIGN 16
362
.loop
371
.loop
Lines 400-416 ALIGN 16 Link Here
400
  jnz near .loop
409
  jnz near .loop
401
410
402
.done
411
.done
403
  pmaddwd mm5, [mmx_one]
412
  pmaddwd mm5, [ebp + mmx_one wrt ..gotoff]
404
  movq mm0, mm5
413
  movq mm0, mm5
405
  psrlq mm5, 32
414
  psrlq mm5, 32
406
  paddd mm0, mm5
415
  paddd mm0, mm5
407
  movd eax, mm0                 ; return sum
416
  movd eax, mm0                 ; return sum
408
417
418
  pop ebp
409
  pop ebx
419
  pop ebx
410
  pop edi
420
  pop edi
411
  pop esi
421
  pop esi
412
  pop ecx
422
  pop ecx
413
414
  ret
423
  ret
415
424
416
ALIGN 16
425
ALIGN 16
Lines 556-562 dequant_mpeg_intra_mmx: Link Here
556
  mov eax, [esp + 4 + 12] ; quant
565
  mov eax, [esp + 4 + 12] ; quant
557
  mov ebx, [esp + 4 + 20] ; mpeg_quant_matrices
566
  mov ebx, [esp + 4 + 20] ; mpeg_quant_matrices
558
567
559
  movq mm7, [mmx_mul_quant  + eax*8 - 8]
568
  push ebp
569
  call get_pc.bp
570
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
571
572
  movq mm7, [ebp + mmx_mul_quant  + eax*8 - 8 wrt ..gotoff]
560
  mov eax, -16      ; to keep ALIGNed, we regularly process coeff[0]
573
  mov eax, -16      ; to keep ALIGNed, we regularly process coeff[0]
561
  psllw mm7, 2      ; << 2. See comment.
574
  psllw mm7, 2      ; << 2. See comment.
562
  pxor mm6, mm6     ; this is a NOP
575
  pxor mm6, mm6     ; this is a NOP
Lines 595-605 ALIGN 16 Link Here
595
  pmullw mm6, mm3   ; low  of coeff*(matrix*quant)
608
  pmullw mm6, mm3   ; low  of coeff*(matrix*quant)
596
  pmulhw mm3, mm5   ; high of coeff*(matrix*quant)
609
  pmulhw mm3, mm5   ; high of coeff*(matrix*quant)
597
610
598
  pcmpgtw mm0, [zero]
611
  pcmpgtw mm0, [ebp + zero wrt ..gotoff]
599
  paddusw mm2, mm0
612
  paddusw mm2, mm0
600
  psrlw mm2, 5
613
  psrlw mm2, 5
601
614
602
  pcmpgtw mm3, [zero]
615
  pcmpgtw mm3, [ebp + zero wrt ..gotoff]
603
  paddusw mm6, mm3
616
  paddusw mm6, mm3
604
  psrlw mm6, 5
617
  psrlw mm6, 5
605
618
Lines 620-641 ALIGN 16 Link Here
620
    ; deal with DC
633
    ; deal with DC
621
  movd mm0, [ecx]
634
  movd mm0, [ecx]
622
  pmullw mm0, [esp + 4 + 16]  ; dcscalar
635
  pmullw mm0, [esp + 4 + 16]  ; dcscalar
623
  movq mm2, [mmx_32767_minus_2047]
636
  movq mm2, [ebp + mmx_32767_minus_2047 wrt ..gotoff]
624
  paddsw mm0, mm2
637
  paddsw mm0, mm2
625
  psubsw mm0, mm2
638
  psubsw mm0, mm2
626
  movq mm2, [mmx_32768_minus_2048]
639
  movq mm2, [ebp + mmx_32768_minus_2048 wrt ..gotoff]
627
  psubsw mm0, mm2
640
  psubsw mm0, mm2
628
  paddsw mm0, mm2
641
  paddsw mm0, mm2
629
  movd eax, mm0
642
  movd eax, mm0
630
  mov [edx], ax
643
  mov [edx], ax
631
644
632
  xor eax, eax
645
  xor eax, eax
633
  
646
647
  pop ebp
634
  pop ebx
648
  pop ebx
635
649
636
  ret
650
  ret
637
.endfunc
651
.endfunc
638
652
653
extern  _GLOBAL_OFFSET_TABLE_
654
get_pc.bp:
655
  mov ebp, [esp]
656
  retn
657
639
;-----------------------------------------------------------------------------
658
;-----------------------------------------------------------------------------
640
;
659
;
641
; uint32_t dequant_mpeg_inter_mmx(int16_t * data,
660
; uint32_t dequant_mpeg_inter_mmx(int16_t * data,
Lines 660-666 dequant_mpeg_inter_mmx: Link Here
660
  mov eax, [esp + 4 + 12]        ; quant
679
  mov eax, [esp + 4 + 12]        ; quant
661
  mov ebx, [esp + 4 + 16]		   ; mpeg_quant_matrices
680
  mov ebx, [esp + 4 + 16]		   ; mpeg_quant_matrices
662
681
663
  movq mm7, [mmx_mul_quant  + eax*8 - 8]
682
  push ebp
683
  call get_pc.bp
684
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
685
686
  movq mm7, [ebp + mmx_mul_quant  + eax*8 - 8 wrt ..gotoff]
664
  mov eax, -16
687
  mov eax, -16
665
  paddw mm7, mm7    ; << 1
688
  paddw mm7, mm7    ; << 1
666
  pxor mm6, mm6     ; mismatch sum
689
  pxor mm6, mm6     ; mismatch sum
Lines 702-708 ALIGN 16 Link Here
702
  movq mm4, mm7     ; (matrix*quant)
725
  movq mm4, mm7     ; (matrix*quant)
703
  pmullw mm4,  [ebx + 512 + 8*eax + 8*16 -2*8 + 8]
726
  pmullw mm4,  [ebx + 512 + 8*eax + 8*16 -2*8 + 8]
704
727
705
  pcmpgtw mm5, [zero]
728
  pcmpgtw mm5, [ebp + zero wrt ..gotoff]
706
  paddusw mm0, mm5
729
  paddusw mm0, mm5
707
  psrlw mm0, 5
730
  psrlw mm0, 5
708
  pxor mm0, mm1     ; start restoring sign
731
  pxor mm0, mm1     ; start restoring sign
Lines 713-719 ALIGN 16 Link Here
713
  pmullw mm2, mm4   ; low  of c*(matrix*quant)
736
  pmullw mm2, mm4   ; low  of c*(matrix*quant)
714
  psubw mm0, mm1    ; finish restoring sign
737
  psubw mm0, mm1    ; finish restoring sign
715
738
716
  pcmpgtw mm5, [zero]
739
  pcmpgtw mm5, [ebp + zero wrt ..gotoff]
717
  paddusw mm2, mm5
740
  paddusw mm2, mm5
718
  psrlw mm2, 5
741
  psrlw mm2, 5
719
  pxor mm2, mm3     ; start restoring sign
742
  pxor mm2, mm3     ; start restoring sign
Lines 744-750 ALIGN 16 Link Here
744
  xor word [edx + 2*63], ax
767
  xor word [edx + 2*63], ax
745
768
746
  xor eax, eax
769
  xor eax, eax
747
  
770
771
  pop ebp
748
  pop ebx
772
  pop ebx
749
773
750
  ret
774
  ret
(-)xvidcore-1.1.3-old/src/quant/x86_asm/quantize_mpeg_xmm.asm (-29 / +57 lines)
Lines 188-195 quant_mpeg_intra_xmm: Link Here
188
  push esi
188
  push esi
189
  push edi
189
  push edi
190
  push ebx
190
  push ebx
191
  nop
191
192
  mov edi, [esp + 12 + 20]		; mpeg_quant_matrices
192
  push ebp
193
  call get_pc.bp
194
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
195
196
  mov edi, [esp + 16 + 20]		; mpeg_quant_matrices
193
  mov esi, -14
197
  mov esi, -14
194
  pxor mm0, mm0
198
  pxor mm0, mm0
195
  pxor mm3, mm3
199
  pxor mm3, mm3
Lines 226-233 ALIGN 16 Link Here
226
  psubw mm0, mm2
230
  psubw mm0, mm2
227
  psubw mm3, mm6
231
  psubw mm3, mm6
228
  nop4
232
  nop4
229
  movq mm2, [quantd + ecx * 8 - 8]
233
  movq mm2, [ebp + quantd + ecx * 8 - 8 wrt ..gotoff]
230
  movq mm6, [mmx_divs + ecx * 8 - 8]
234
  movq mm6, [ebp + mmx_divs + ecx * 8 - 8 wrt ..gotoff]
231
  paddw mm5, mm2
235
  paddw mm5, mm2
232
  paddw mm7, mm2
236
  paddw mm7, mm2
233
  mov esp, esp
237
  mov esp, esp
Lines 250-276 ALIGN 16 Link Here
250
254
251
.done
255
.done
252
; calculate  data[0] // (int32_t)dcscalar)
256
; calculate  data[0] // (int32_t)dcscalar)
253
  mov esi, [esp + 12 + 16]  ; dcscalar
257
  mov esi, [esp + 16 + 16]  ; dcscalar
254
  movsx ecx, word [eax]
258
  movsx ecx, word [eax]
255
  mov edi, ecx
259
  mov edi, ecx
256
  mov edx, [esp + 12 + 16]
260
  mov edx, [esp + 16 + 16]
257
  shr edx, 1            ; ebx = dcscalar /2
261
  shr edx, 1            ; ebx = dcscalar /2
258
  sar edi, 31           ; cdq is vectorpath
262
  sar edi, 31           ; cdq is vectorpath
259
  xor edx, edi          ; ebx = eax V -eax -1
263
  xor edx, edi          ; ebx = eax V -eax -1
260
  sub ecx, edi
264
  sub ecx, edi
261
  add ecx, edx
265
  add ecx, edx
262
  mov edx, [dword esp + 12 + 4]
266
  mov edx, [dword esp + 16 + 4]
263
  mov esi, [int_div+4*esi]
267
  mov esi, [ebp + int_div+4*esi wrt ..gotoff]
264
  imul ecx, esi
268
  imul ecx, esi
265
  sar ecx, 17
269
  sar ecx, 17
266
  lea ebx, [byte ecx + 1]
270
  lea ebx, [byte ecx + 1]
267
  cmovs ecx, ebx
271
  cmovs ecx, ebx
268
  ; idiv    cx          ; ecx = edi:ecx / dcscalar
272
  ; idiv    cx          ; ecx = edi:ecx / dcscalar
269
273
270
  mov ebx, [esp]
274
  mov ebp, [esp]
271
  mov edi, [esp+4]
275
  mov ebx, [esp+4]
272
  mov esi, [esp+8]
276
  mov edi, [esp+8]
273
  add esp, byte 12
277
  mov esi, [esp+12]
278
  add esp, byte 16
274
  mov [edx], cx     ; coeff[0] = ax
279
  mov [edx], cx     ; coeff[0] = ax
275
280
276
  xor eax, eax
281
  xor eax, eax
Lines 303-309 ALIGN 16 Link Here
303
  psubw mm0, mm2        ;mismatch
308
  psubw mm0, mm2        ;mismatch
304
  psubw mm3, mm6
309
  psubw mm3, mm6
305
  nop4
310
  nop4
306
  movq mm2, [quantd + ecx * 8 - 8]
311
  movq mm2, [ebp + quantd + ecx * 8 - 8 wrt ..gotoff]
307
  paddw mm5, mm2        ;first approx with quantd
312
  paddw mm5, mm2        ;first approx with quantd
308
  paddw mm7, mm2
313
  paddw mm7, mm2
309
  mov esp, esp
314
  mov esp, esp
Lines 353-360 ALIGN 8 Link Here
353
  psubw mm0, mm2        ;mismatch
358
  psubw mm0, mm2        ;mismatch
354
  psubw mm3, mm6
359
  psubw mm3, mm6
355
  nop4
360
  nop4
356
  movq mm2, [quantd + ecx * 8 - 8]
361
  movq mm2, [ebp + quantd + ecx * 8 - 8 wrt ..gotoff]
357
  movq mm6, [mmx_div + ecx * 8 - 8] ; divs for q<=16
362
  movq mm6, [ebp + mmx_div + ecx * 8 - 8 wrt ..gotoff] ; divs for q<=16
358
  paddw mm5, mm2        ;first approx with quantd
363
  paddw mm5, mm2        ;first approx with quantd
359
  paddw mm7, mm2
364
  paddw mm7, mm2
360
  mov esp, esp
365
  mov esp, esp
Lines 397-404 quant_mpeg_inter_xmm: Link Here
397
  push esi
402
  push esi
398
  push edi
403
  push edi
399
  push ebx
404
  push ebx
400
  nop
405
401
  mov edi, [esp + 12 + 16]
406
  push ebp
407
  call get_pc.bp
408
  add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
409
410
  mov edi, [esp + 16 + 16]
402
  mov esi, -14
411
  mov esi, -14
403
  mov ebx, esp
412
  mov ebx, esp
404
  sub esp, byte 24
413
  sub esp, byte 24
Lines 440-447 ALIGN 16 Link Here
440
  pmullw mm6, mm7
449
  pmullw mm6, mm7
441
  psubw mm0, mm2
450
  psubw mm0, mm2
442
  psubw mm3, mm6
451
  psubw mm3, mm6
443
  movq mm2, [byte ebx]
452
  movq mm2, [ebp + ebx wrt ..gotoff]
444
  movq mm6, [mmx_divs + ecx * 8 - 8]
453
  movq mm6, [ebp + mmx_divs + ecx * 8 - 8 wrt ..gotoff]
445
  pmulhuw mm0, [edi + 768 + 8*esi+112]
454
  pmulhuw mm0, [edi + 768 + 8*esi+112]
446
  pmulhuw mm3, [edi + 768 + 8*esi+120]
455
  pmulhuw mm3, [edi + 768 + 8*esi+120]
447
  paddw mm2, [ebx+8]    ;sum
456
  paddw mm2, [ebx+8]    ;sum
Lines 466-476 ALIGN 16 Link Here
466
.done
475
.done
467
; calculate  data[0] // (int32_t)dcscalar)
476
; calculate  data[0] // (int32_t)dcscalar)
468
  paddw mm2, [ebx]
477
  paddw mm2, [ebx]
469
  mov ebx, [esp+24]
478
  mov ebx, [esp+4+24]
470
  mov edi, [esp+4+24]
479
  mov edi, [esp+8+24]
471
  mov esi, [esp+8+24]
480
  mov esi, [esp+12+24]
472
  add esp, byte 12+24
481
  pmaddwd mm2, [ebp + mmx_one wrt ..gotoff]
473
  pmaddwd mm2, [mmx_one]
482
  mov ebp, [esp+24]
483
  add esp, byte 16+24
474
  punpckldq mm0, mm2 ;get low dw to mm0:high
484
  punpckldq mm0, mm2 ;get low dw to mm0:high
475
  paddd mm0,mm2
485
  paddd mm0,mm2
476
  punpckhdq mm0, mm0 ;get result to low
486
  punpckhdq mm0, mm0 ;get result to low
Lines 554-560 ALIGN 8 Link Here
554
  psubw mm0,mm2         ;mismatch
564
  psubw mm0,mm2         ;mismatch
555
  psubw mm3,mm6
565
  psubw mm3,mm6
556
  movq mm2,[byte ebx]
566
  movq mm2,[byte ebx]
557
  movq mm6,[mmx_div + ecx * 8 - 8]  ; divs for q<=16
567
  movq mm6,[ebp + mmx_div + ecx * 8 - 8 wrt ..gotoff]  ; divs for q<=16
558
  pmulhuw mm0,[edi + 768 + 8*esi+112] ;correction
568
  pmulhuw mm0,[edi + 768 + 8*esi+112] ;correction
559
  pmulhuw mm3,[edi + 768 + 8*esi+120]
569
  pmulhuw mm3,[edi + 768 + 8*esi+120]
560
  paddw mm2,[ebx+8]     ;sum
570
  paddw mm2,[ebx+8]     ;sum
Lines 644-650 ALIGN 16 Link Here
644
dequant_mpeg_intra_3dne:
654
dequant_mpeg_intra_3dne:
645
  mov eax, [esp+12] ; quant
655
  mov eax, [esp+12] ; quant
646
  mov ecx, [esp+8]  ; coeff
656
  mov ecx, [esp+8]  ; coeff
647
  movq mm7, [mmx_mul_quant  + eax*8 - 8]
657
658
  call get_pc.dx
659
  add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
660
661
  movq mm7, [edx + mmx_mul_quant  + eax*8 - 8 wrt ..gotoff]
648
  psllw mm7, 2      ; << 2. See comment.
662
  psllw mm7, 2      ; << 2. See comment.
649
  mov edx, [esp+4]  ; data
663
  mov edx, [esp+4]  ; data
650
  push ebx
664
  push ebx
Lines 700-705 ALIGN 4 Link Here
700
  ret
714
  ret
701
.endfunc
715
.endfunc
702
716
717
extern  _GLOBAL_OFFSET_TABLE_
718
get_pc
719
.dx:
720
  mov edx, [esp]
721
  retn
722
723
.bp:
724
  mov ebp, [esp]
725
  retn
726
703
;-----------------------------------------------------------------------------
727
;-----------------------------------------------------------------------------
704
;
728
;
705
; uint32_t dequant_mpeg_inter_3dne(int16_t * data,
729
; uint32_t dequant_mpeg_inter_3dne(int16_t * data,
Lines 716-731 ALIGN 4 Link Here
716
740
717
ALIGN 16
741
ALIGN 16
718
dequant_mpeg_inter_3dne:
742
dequant_mpeg_inter_3dne:
719
  mov edx, [esp+ 4]        ; data
720
  mov ecx, [esp+ 8]        ; coeff
743
  mov ecx, [esp+ 8]        ; coeff
721
  mov eax, [esp+12]        ; quant
744
  mov eax, [esp+12]        ; quant
722
  movq mm7, [mmx_mul_quant  + eax*8 - 8]
745
746
  call get_pc.dx
747
  add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
748
749
  movq mm7, [edx + mmx_mul_quant  + eax*8 - 8 wrt ..gotoff]
723
  mov eax, -14
750
  mov eax, -14
724
  paddw mm7, mm7    ; << 1
751
  paddw mm7, mm7    ; << 1
725
  pxor mm6, mm6     ; mismatch sum
752
  pxor mm6, mm6     ; mismatch sum
726
  push esi
753
  push esi
727
  push edi
754
  push edi
728
  mov esi, mmzero
755
  lea esi, [edx + mmzero wrt ..gotoff]
756
  mov edx, [esp + 8 + 4]  ; data
729
  pxor mm1, mm1
757
  pxor mm1, mm1
730
  pxor mm3, mm3
758
  pxor mm3, mm3
731
  mov edi, [esp + 8 + 16] ; mpeg_quant_matrices
759
  mov edi, [esp + 8 + 16] ; mpeg_quant_matrices
(-)xvidcore-1.1.3-old/src/utils/x86_asm/cpuid.asm (-20 / +5 lines)
Lines 66-85 BITS 32 Link Here
66
%define XVID_CPU_TSC      (1<< 6)
66
%define XVID_CPU_TSC      (1<< 6)
67
67
68
;=============================================================================
68
;=============================================================================
69
; Read only data
70
;=============================================================================
71
72
ALIGN 32
73
%ifdef FORMAT_COFF
74
SECTION .rodata
75
%else
76
SECTION .rodata align=16
77
%endif
78
79
vendorAMD:
80
		db "AuthenticAMD"
81
82
;=============================================================================
83
; Macros
69
; Macros
84
;=============================================================================
70
;=============================================================================
85
71
Lines 161-171 check_cpu_features: Link Here
161
  cpuid
147
  cpuid
162
148
163
 ; AMD cpu ?
149
 ; AMD cpu ?
164
  lea esi, [vendorAMD]
150
  cmp dword [esp],"Auth"
165
  lea edi, [esp]
151
  jnz .cpu_quit
166
  mov ecx, 12
152
  cmp dword [esp+4],"enti"
167
  cld
153
  jnz .cpu_quit
168
  repe cmpsb
154
  cmp dword [esp+8],"cAMD"
169
  jnz .cpu_quit
155
  jnz .cpu_quit
170
156
171
  ; 3DNow! support ?
157
  ; 3DNow! support ?
Lines 208-214 sse2_os_trigger: Link Here
208
  ret
194
  ret
209
.endfunc
195
.endfunc
210
196
211
212
; enter/exit mmx state
197
; enter/exit mmx state
213
ALIGN 16
198
ALIGN 16
214
cglobal emms_mmx
199
cglobal emms_mmx
(-)xvidcore-1.1.3-old/src/utils/x86_asm/interlacing_mmx.asm (-3 / +13 lines)
Lines 129-134 cglobal MBFieldTest_mmx Link Here
129
  paddw mm7, mm3
129
  paddw mm7, mm3
130
%endmacro
130
%endmacro
131
131
132
extern  _GLOBAL_OFFSET_TABLE_
133
get_pc.bx:
134
  mov ebx, [esp]
135
  retn
136
132
;-----------------------------------------------------------------------------
137
;-----------------------------------------------------------------------------
133
;
138
;
134
; uint32_t MBFieldTest_mmx(int16_t * const data);
139
; uint32_t MBFieldTest_mmx(int16_t * const data);
Lines 141-147 MBFieldTest_mmx: Link Here
141
  push esi
146
  push esi
142
  push edi
147
  push edi
143
148
144
  mov esi, [esp+8+4]            ; esi = top left block
149
  push ebx
150
  call get_pc.bx
151
  add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
152
153
  mov esi, [esp+12+4]            ; esi = top left block
145
  mov edi, esi
154
  mov edi, esi
146
  add edi, 256                  ; edi = bottom left block
155
  add edi, 256                  ; edi = bottom left block
147
156
Lines 184-190 MBFieldTest_mmx: Link Here
184
  psubw m14, mm4
193
  psubw m14, mm4
185
  paddw mm6, m14                ; add to frame total
194
  paddw mm6, m14                ; add to frame total
186
195
187
  mov ecx, [nexts+eax*4]        ; move esi/edi 8 pixels to the right
196
  mov ecx, [ebx+nexts+eax*4 wrt ..gotoff]        ; move esi/edi 8 pixels to the right
188
  add esi, ecx
197
  add esi, ecx
189
  add edi, ecx
198
  add edi, ecx
190
199
Lines 192-198 MBFieldTest_mmx: Link Here
192
  jnz near .loop
201
  jnz near .loop
193
202
194
.decide:
203
.decide:
195
  movq mm0, [ones]              ; add packed words into single dwords
204
  movq mm0, [ebx + ones wrt ..gotoff]              ; add packed words into single dwords
196
  pmaddwd mm6, mm0
205
  pmaddwd mm6, mm0
197
  pmaddwd mm7, mm0
206
  pmaddwd mm7, mm0
198
207
Lines 211-216 MBFieldTest_mmx: Link Here
211
  inc eax                       ; if frame>=field, use field dct (return 1)
220
  inc eax                       ; if frame>=field, use field dct (return 1)
212
221
213
.end:
222
.end:
223
  pop ebx
214
  pop edi
224
  pop edi
215
  pop esi
225
  pop esi
216
226
(-)xvidcore-1.1.3-old/src/utils/x86_asm/mem_transfer_mmx.asm (-18 / +7 lines)
Lines 46-65 BITS 32 Link Here
46
%endmacro
46
%endmacro
47
47
48
;=============================================================================
48
;=============================================================================
49
; Read only data
50
;=============================================================================
51
52
%ifdef FORMAT_COFF
53
SECTION .rodata
54
%else
55
SECTION .rodata align=16
56
%endif
57
58
ALIGN 16
59
mmx_one:
60
	dw 1, 1, 1, 1
61
62
;=============================================================================
63
; Code
49
; Code
64
;=============================================================================
50
;=============================================================================
65
51
Lines 260-267 transfer_8to16subro_mmx: Link Here
260
  punpckhbw mm3, mm7
246
  punpckhbw mm3, mm7
261
  paddusw mm4, mm1
247
  paddusw mm4, mm1
262
  paddusw mm6, mm3
248
  paddusw mm6, mm3
263
  paddusw mm4, [mmx_one]
249
  paddusw mm4, [esp]
264
  paddusw mm6, [mmx_one]
250
  paddusw mm6, [esp]
265
  psrlw mm4, 1
251
  psrlw mm4, 1
266
  psrlw mm6, 1
252
  psrlw mm6, 1
267
  packuswb mm4, mm6
253
  packuswb mm4, mm6
Lines 278-285 transfer_8to16subro_mmx: Link Here
278
  punpckhbw mm3, mm7
264
  punpckhbw mm3, mm7
279
  paddusw mm5, mm1
265
  paddusw mm5, mm1
280
  paddusw mm6, mm3
266
  paddusw mm6, mm3
281
  paddusw mm5, [mmx_one]
267
  paddusw mm5, [esp]
282
  paddusw mm6, [mmx_one]
268
  paddusw mm6, [esp]
283
  lea esi, [esi+2*edx]
269
  lea esi, [esi+2*edx]
284
  psrlw mm5, 1
270
  psrlw mm5, 1
285
  psrlw mm6, 1
271
  psrlw mm6, 1
Lines 323-332 transfer_8to16sub2_mmx: Link Here
323
  mov edx, [esp+8+20] ; Stride
309
  mov edx, [esp+8+20] ; Stride
324
  pxor mm7, mm7
310
  pxor mm7, mm7
325
311
312
  push dword 0x00010001
313
  push dword 0x00010001
326
  COPY_8_TO_16_SUB2_MMX 0
314
  COPY_8_TO_16_SUB2_MMX 0
327
  COPY_8_TO_16_SUB2_MMX 1
315
  COPY_8_TO_16_SUB2_MMX 1
328
  COPY_8_TO_16_SUB2_MMX 2
316
  COPY_8_TO_16_SUB2_MMX 2
329
  COPY_8_TO_16_SUB2_MMX 3
317
  COPY_8_TO_16_SUB2_MMX 3
318
  add esp, byte 8
330
319
331
  pop esi
320
  pop esi
332
  pop ebx
321
  pop ebx

Return to bug 202465