Gentoo's Bugzilla – Attachment 71295 Details for Bug 90287: media-libs/xvid: ELF text relocations / executable stacks
[patch] TEXTREL fix for xvid
Description: TEXTREL fix for xvid
Filename:    xvid-1.1.0_beta2-pic-fix.patch
MIME Type:   text/plain
Creator:     PaX Team
Created:     2005-10-23 10:38:02 UTC
Size:        194.56 KB
Flags:       patch, obsolete
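For readers skimming the diff below: the patch eliminates ELF text relocations and executable-stack markings in xvid's hand-written x86 assembly in three ways. It appends a ".note.GNU-stack noalloc noexec nowrite progbits" section to every .asm file so the assembled objects no longer request an executable stack; it replaces small read-only masks with values built on the stack at run time (the ignore_dc masks in cbp_mmx.asm and cbp_sse2.asm); and it converts absolute data references into GOT-relative ones, loading the GOT base into ebx through a small get_pc.bx thunk and addressing tables with "wrt ..gotoff" (argument offsets such as [esp+4+4] are adjusted for the extra "push ebx"). It also comments out the yasm check in configure.in so the build uses nasm. The following sketch is not part of the patch (some_const and example_func are made-up names); it only condenses the PIC pattern the patch applies to each function:

        ; --- Illustrative sketch, not part of the attachment ---
        ; Fetch the current instruction pointer via a thunk, derive the
        ; GOT base in ebx, then address read-only data GOT-relative
        ; instead of via absolute (text-relocating) references.

        BITS 32

        SECTION .rodata align=16
        some_const:                     ; hypothetical constant table
                dw 0, -1, -1, -1, -1, -1, -1, -1

        SECTION .text

        extern _GLOBAL_OFFSET_TABLE_

        get_pc.bx:
                mov ebx, [esp]          ; return address = address of the
                retn                    ; instruction following the call

        example_func:                   ; hypothetical function
                push ebx                ; ebx is callee-saved
                call get_pc.bx          ; ebx <- address of the next insn
                add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc ; ebx <- GOT base

                ; was:  movq mm7, [some_const]   (absolute ref -> TEXTREL)
                movq mm7, [ebx + some_const wrt ..gotoff]

                pop ebx
                ret

        ; Tell the linker this object does not need an executable stack.
        section .note.GNU-stack noalloc noexec nowrite progbits

Functions that only need a one-off 64- or 128-bit mask (calc_cbp_mmx, calc_cbp_sse2) avoid the GOT entirely by pushing the mask onto the stack and loading it from [esp], trading a few instructions for not having to reserve ebx; fdct_mmx_skal's unrolled loop likewise moves its counter from ebx to esi because ebx now holds the GOT base.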
>diff -urp xvidcore-1.1.0-beta2-old/build/generic/configure.in xvidcore-1.1.0-beta2/build/generic/configure.in >--- xvidcore-1.1.0-beta2-old/build/generic/configure.in 2005-10-23 18:41:52.000000000 +0200 >+++ xvidcore-1.1.0-beta2/build/generic/configure.in 2005-10-23 18:42:38.000000000 +0200 >@@ -342,11 +342,11 @@ if test "$ARCHITECTURE" = "IA32" -o "$AR > chosen_asm_prog="" > > dnl Check for yasm first >- AC_CHECK_PROG([ac_yasm], [$yasm_prog], [yes], [no], , [yes]) >- if test "$ac_yasm" = "yes" ; then >- found_nasm_comp_prog="yes" >- chosen_asm_prog="$yasm_prog" >- fi >+dnl AC_CHECK_PROG([ac_yasm], [$yasm_prog], [yes], [no], , [yes]) >+dnl if test "$ac_yasm" = "yes" ; then >+dnl found_nasm_comp_prog="yes" >+dnl chosen_asm_prog="$yasm_prog" >+dnl fi > > dnl if yasm hasn't been found, then check for nasm (not buggy version) > if test "$found_nasm_comp_prog" = "no" -a "$ARCHITECTURE" != "X86_64" ; then >diff -urp xvidcore-1.1.0-beta2-old/src/bitstream/x86_asm/cbp_3dne.asm xvidcore-1.1.0-beta2/src/bitstream/x86_asm/cbp_3dne.asm >--- xvidcore-1.1.0-beta2-old/src/bitstream/x86_asm/cbp_3dne.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/bitstream/x86_asm/cbp_3dne.asm 2005-10-23 18:42:38.000000000 +0200 >@@ -123,3 +123,5 @@ calc_cbp_3dne: > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/bitstream/x86_asm/cbp_mmx.asm xvidcore-1.1.0-beta2/src/bitstream/x86_asm/cbp_mmx.asm >--- xvidcore-1.1.0-beta2-old/src/bitstream/x86_asm/cbp_mmx.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/bitstream/x86_asm/cbp_mmx.asm 2005-10-23 18:42:38.000000000 +0200 >@@ -49,21 +49,6 @@ BITS 32 > %endmacro > > ;============================================================================= >-; Local data >-;============================================================================= >- >-%ifdef FORMAT_COFF >-SECTION .rodata >-%else >-SECTION .rodata align=16 >-%endif >- >-ALIGN 16 >- >-ignore_dc: >- dw 0, -1, -1, -1, -1, -1, -1, -1 >- >-;============================================================================= > ; Code > ;============================================================================= > >@@ -84,7 +69,13 @@ calc_cbp_mmx: > xor eax, eax ; cbp = 0 > mov edx, (1 << 5) > >- movq mm7, [ignore_dc] >+ push byte 0 ; align esp to 16 bytes >+ push byte -1 >+ push byte -1 >+ push byte -1 >+ push dword 0xFFFF0000 >+ movq mm7, [esp] >+ add esp, byte 20 > > .loop > movq mm0, [esi] >@@ -134,3 +125,5 @@ calc_cbp_mmx: > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/bitstream/x86_asm/cbp_sse2.asm xvidcore-1.1.0-beta2/src/bitstream/x86_asm/cbp_sse2.asm >--- xvidcore-1.1.0-beta2-old/src/bitstream/x86_asm/cbp_sse2.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/bitstream/x86_asm/cbp_sse2.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -69,20 +69,6 @@ BITS 32 > %endmacro > > ;============================================================================= >-; Data (Read Only) >-;============================================================================= >- >-%ifdef FORMAT_COFF >-SECTION .rodata >-%else >-SECTION .rodata align=16 >-%endif >- >-ALIGN 16 >-ignore_dc: >- dw 0, -1, -1, -1, -1, -1, -1, -1 >- >-;============================================================================= > ; Code > ;============================================================================= > >@@ -98,7 +84,13 @@ calc_cbp_sse2: > mov edx, [esp+4] ; 
coeff[] > xor eax, eax ; cbp = 0 > >- movdqu xmm7, [ignore_dc] ; mask to ignore dc value >+ sub esp,byte 12 ; align esp to 16 bytes >+ push byte -1 >+ push byte -1 >+ push byte -1 >+ push dword 0xFFFF0000 >+ movdqu xmm7, [esp] ; mask to ignore dc value >+ add esp, byte 28 > pxor xmm6, xmm6 ; zero > > LOOP_SSE2 0 >@@ -140,3 +132,5 @@ calc_cbp_sse2: > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm xvidcore-1.1.0-beta2/src/dct/x86_asm/fdct_mmx_ffmpeg.asm >--- xvidcore-1.1.0-beta2-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -204,7 +204,7 @@ fdct_r_row: > psllw mm4, SHIFT_FRW_COL > movq mm6, mm0 > psubsw mm2, mm1 >- movq mm1, [fdct_tg_all_16 + 4*2] >+ movq mm1, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff] > psubsw mm0, mm4 > movq mm7, [%2 + %3*2 + 3*16] > pmulhw mm1, mm0 >@@ -216,9 +216,9 @@ fdct_r_row: > psubsw mm5, mm7 > paddsw mm1, mm5 > paddsw mm4, mm7 >- por mm1, [fdct_one_corr] >+ por mm1, [ebx + fdct_one_corr wrt ..gotoff] > psllw mm2, SHIFT_FRW_COL + 1 >- pmulhw mm5, [fdct_tg_all_16 + 4*2] >+ pmulhw mm5, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff] > movq mm7, mm4 > psubsw mm3, [%2 + %3*2 + 5*16] > psubsw mm4, mm6 >@@ -230,34 +230,34 @@ fdct_r_row: > movq mm6, mm2 > movq [%1 + %3*2 + 4*16], mm4 > paddsw mm2, mm3 >- pmulhw mm2, [ocos_4_16] >+ pmulhw mm2, [ebx + ocos_4_16 wrt ..gotoff] > psubsw mm6, mm3 >- pmulhw mm6, [ocos_4_16] >+ pmulhw mm6, [ebx + ocos_4_16 wrt ..gotoff] > psubsw mm5, mm0 >- por mm5, [fdct_one_corr] >+ por mm5, [ebx + fdct_one_corr wrt ..gotoff] > psllw mm1, SHIFT_FRW_COL >- por mm2, [fdct_one_corr] >+ por mm2, [ebx + fdct_one_corr wrt ..gotoff] > movq mm4, mm1 > movq mm3, [%2 + %3*2 + 0*16] > paddsw mm1, mm6 > psubsw mm3, [%2 + %3*2 + 7*16] > psubsw mm4, mm6 >- movq mm0, [fdct_tg_all_16 + 0*2] >+ movq mm0, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff] > psllw mm3, SHIFT_FRW_COL >- movq mm6, [fdct_tg_all_16 + 8*2] >+ movq mm6, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff] > pmulhw mm0, mm1 > movq [%1 + %3*2 + 0*16], mm7 > pmulhw mm6, mm4 > movq [%1 + %3*2 + 6*16], mm5 > movq mm7, mm3 >- movq mm5, [fdct_tg_all_16 + 8*2] >+ movq mm5, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff] > psubsw mm7, mm2 > paddsw mm3, mm2 > pmulhw mm5, mm7 > paddsw mm0, mm3 > paddsw mm6, mm4 >- pmulhw mm3, [fdct_tg_all_16 + 0*2] >- por mm0, [fdct_one_corr] >+ pmulhw mm3, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff] >+ por mm0, [ebx + fdct_one_corr wrt ..gotoff] > paddsw mm5, mm7 > psubsw mm7, mm6 > movq [%1 + %3*2 + 1*16], mm0 >@@ -287,28 +287,28 @@ fdct_r_row: > movq mm6, mm5 > punpckldq mm3, mm5 > punpckhdq mm6, mm3 >- movq mm3, [%3 + 0*2] >- movq mm4, [%3 + 4*2] >+ movq mm3, [0*2 + %3] >+ movq mm4, [4*2 + %3] > punpckldq mm2, mm0 > pmaddwd mm3, mm0 > punpckhdq mm1, mm2 >- movq mm2, [%3 + 16*2] >+ movq mm2, [16*2 + %3] > pmaddwd mm4, mm1 >- pmaddwd mm0, [%3 + 8*2] >- movq mm7, [%3 + 20*2] >+ pmaddwd mm0, [8*2 + %3] >+ movq mm7, [20*2 + %3] > pmaddwd mm2, mm5 >- paddd mm3, [fdct_r_row] >+ paddd mm3, [ebx + fdct_r_row wrt ..gotoff] > pmaddwd mm7, mm6 >- pmaddwd mm1, [%3 + 12*2] >+ pmaddwd mm1, [12*2 + %3] > paddd mm3, mm4 >- pmaddwd mm5, [%3 + 24*2] >- pmaddwd mm6, [%3 + 28*2] >+ pmaddwd mm5, [24*2 + %3] >+ pmaddwd mm6, [28*2 + %3] > paddd mm2, mm7 >- paddd mm0, [fdct_r_row] >+ paddd mm0, [ebx + fdct_r_row wrt ..gotoff] > psrad mm3, SHIFT_FRW_ROW >- paddd mm2, [fdct_r_row] >+ paddd 
mm2, [ebx + fdct_r_row wrt ..gotoff] > paddd mm0, mm1 >- paddd mm5, [fdct_r_row] >+ paddd mm5, [ebx + fdct_r_row wrt ..gotoff] > psrad mm2, SHIFT_FRW_ROW > paddd mm5, mm6 > psrad mm0, SHIFT_FRW_ROW >@@ -336,23 +336,23 @@ fdct_r_row: > psubsw mm1, mm5 > pshufw mm2, mm0, 0x4E > pshufw mm3, mm1, 0x4E >- movq mm4, [%3 + 0*2] >- movq mm6, [%3 + 4*2] >- movq mm5, [%3 + 16*2] >- movq mm7, [%3 + 20*2] >+ movq mm4, [ 0*2 + %3] >+ movq mm6, [ 4*2 + %3] >+ movq mm5, [16*2 + %3] >+ movq mm7, [20*2 + %3] > pmaddwd mm4, mm0 > pmaddwd mm5, mm1 > pmaddwd mm6, mm2 > pmaddwd mm7, mm3 >- pmaddwd mm0, [%3 + 8*2] >- pmaddwd mm2, [%3 + 12*2] >- pmaddwd mm1, [%3 + 24*2] >- pmaddwd mm3, [%3 + 28*2] >+ pmaddwd mm0, [ 8*2 + %3] >+ pmaddwd mm2, [12*2 + %3] >+ pmaddwd mm1, [24*2 + %3] >+ pmaddwd mm3, [28*2 + %3] > paddd mm4, mm6 > paddd mm5, mm7 > paddd mm0, mm2 > paddd mm1, mm3 >- movq mm7, [fdct_r_row] >+ movq mm7, [ebx + fdct_r_row wrt ..gotoff] > paddd mm4, mm7 > paddd mm5, mm7 > paddd mm0, mm7 >@@ -377,6 +377,10 @@ cglobal %1 > ;; Move the destination/source address to the eax register > mov eax, [esp + 4] > >+ push ebx >+ call get_pc.bx >+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ > ;; Process the columns (4 at a time) > FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3 > FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7 >@@ -386,12 +390,12 @@ cglobal %1 > %assign i 0 > %rep 8 > ;; Process the 'i'th row >- %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i >+ %2 eax+2*i*8, eax+2*i*8, ebx + tab_frw_01234567+2*32*i wrt ..gotoff > %assign i i+1 > %endrep > %else > mov ecx, 8 >- mov edx, tab_frw_01234567 >+ mov edx, [ebx + tab_frw_01234567 wrt ..gotoff] > ALIGN 8 > .loop > %2 eax, eax, edx >@@ -401,6 +405,7 @@ ALIGN 8 > jne .loop > %endif > >+ pop ebx > ret > .endfunc > %endmacro >@@ -411,6 +416,11 @@ ALIGN 8 > > SECTION .text > >+extern _GLOBAL_OFFSET_TABLE_ >+get_pc.bx: >+ mov ebx, [esp] >+ retn >+ > ;----------------------------------------------------------------------------- > ; void fdct_mmx_ffmpeg(int16_t block[64]); > ;----------------------------------------------------------------------------- >@@ -422,3 +432,6 @@ MAKE_FDCT_FUNC fdct_mmx_ffmpeg, FDCT_ROW > ;----------------------------------------------------------------------------- > > MAKE_FDCT_FUNC fdct_xmm_ffmpeg, FDCT_ROW_XMM >+ >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/dct/x86_asm/fdct_mmx_skal.asm xvidcore-1.1.0-beta2/src/dct/x86_asm/fdct_mmx_skal.asm >--- xvidcore-1.1.0-beta2-old/src/dct/x86_asm/fdct_mmx_skal.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/dct/x86_asm/fdct_mmx_skal.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -294,15 +294,15 @@ MMX_One: > paddsw mm2, mm1 ; mm2: t6+t5 > movq [%1+0*16], mm5 ; => out0 > >- movq mm4, [tan2] ; mm4 <= tan2 >+ movq mm4, [ebx + tan2 wrt ..gotoff] ; mm4 <= tan2 > pmulhw mm4, mm7 ; tm03*tan2 >- movq mm5, [tan2] ; mm5 <= tan2 >+ movq mm5, [ebx + tan2 wrt ..gotoff] ; mm5 <= tan2 > psubsw mm4, mm6 ; out6 = tm03*tan2 - tm12 > pmulhw mm5, mm6 ; tm12*tan2 > paddsw mm5, mm7 ; out2 = tm12*tan2 + tm03 > >- movq mm6, [sqrt2] >- movq mm7, [MMX_One] >+ movq mm6, [ebx + sqrt2 wrt ..gotoff] >+ movq mm7, [ebx + MMX_One wrt ..gotoff] > > pmulhw mm2, mm6 ; mm2: tp65 = (t6 + t5)*cos4 > por mm5, mm7 ; correct out2 >@@ -320,8 +320,8 @@ MMX_One: > paddsw mm2, mm4 ; mm2: tp765 = t7 + tp65 > paddsw mm1, mm5 ; mm1: tp465 = t4 + tm65 > >- movq mm4, [tan3] ; tan3 - 1 >- movq mm5, [tan1] ; tan1 >+ movq mm4, [ebx + tan3 wrt ..gotoff] ; tan3 - 1 >+ movq 
mm5, [ebx + tan1 wrt ..gotoff] ; tan1 > > movq mm7, mm3 ; save tm465 > pmulhw mm3, mm4 ; tm465*(tan3-1) >@@ -364,23 +364,23 @@ MMX_One: > punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1] > punpckhdq mm1, mm7 ; mm1 = [b2 b3 a2 a3] > >- movq mm2, qword [%3 + 0] ; [ M00 M01 M16 M17] >- movq mm3, qword [%3 + 8] ; [ M02 M03 M18 M19] >+ movq mm2, qword [0 + %3] ; [ M00 M01 M16 M17] >+ movq mm3, qword [8 + %3] ; [ M02 M03 M18 M19] > pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17] >- movq mm4, qword [%3 + 16] ; [ M04 M05 M20 M21] >+ movq mm4, qword [16 + %3] ; [ M04 M05 M20 M21] > pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19] >- movq mm5, qword [%3 + 24] ; [ M06 M07 M22 M23] >+ movq mm5, qword [24 + %3] ; [ M06 M07 M22 M23] > pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21] >- movq mm6, qword [%3 + 32] ; [ M08 M09 M24 M25] >+ movq mm6, qword [32 + %3] ; [ M08 M09 M24 M25] > pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23] >- movq mm7, qword [%3 + 40] ; [ M10 M11 M26 M27] >+ movq mm7, qword [40 + %3] ; [ M10 M11 M26 M27] > pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25] > paddd mm2, mm3 ; [ out0 | out1 ] > pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27] > psrad mm2, 16 >- pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29] >+ pmaddwd mm0, qword [48 + %3] ; [a0.M12+a1.M13 | b0.M28+b1.M29] > paddd mm4, mm5 ; [ out2 | out3 ] >- pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31] >+ pmaddwd mm1, qword [56 + %3] ; [a0.M14+a1.M15 | b0.M30+b1.M31] > psrad mm4, 16 > > paddd mm6, mm7 ; [ out4 | out5 ] >@@ -422,23 +422,23 @@ MMX_One: > punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1] > punpckhdq mm1, mm7 ; mm1 = [b2 b3 a2 a3] > >- movq mm2, qword [%3 + 0] ; [ M00 M01 M16 M17] >- movq mm3, qword [%3 + 8] ; [ M02 M03 M18 M19] >+ movq mm2, qword [0 + %3] ; [ M00 M01 M16 M17] >+ movq mm3, qword [8 + %3] ; [ M02 M03 M18 M19] > pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17] >- movq mm4, qword [%3 + 16] ; [ M04 M05 M20 M21] >+ movq mm4, qword [16 + %3] ; [ M04 M05 M20 M21] > pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19] >- movq mm5, qword [%3 + 24] ; [ M06 M07 M22 M23] >+ movq mm5, qword [24 + %3] ; [ M06 M07 M22 M23] > pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21] >- movq mm6, qword [%3 + 32] ; [ M08 M09 M24 M25] >+ movq mm6, qword [32 + %3] ; [ M08 M09 M24 M25] > pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23] >- movq mm7, qword [%3 + 40] ; [ M10 M11 M26 M27] >+ movq mm7, qword [40 + %3] ; [ M10 M11 M26 M27] > pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25] > paddd mm2, mm3 ; [ out0 | out1 ] > pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27] > psrad mm2, 16 >- pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29] >+ pmaddwd mm0, qword [48 + %3] ; [a0.M12+a1.M13 | b0.M28+b1.M29] > paddd mm4, mm5 ; [ out2 | out3 ] >- pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31] >+ pmaddwd mm1, qword [56 + %3] ; [a0.M14+a1.M15 | b0.M30+b1.M31] > psrad mm4, 16 > > paddd mm6, mm7 ; [ out4 | out5 ] >@@ -467,12 +467,16 @@ MMX_One: > ALIGN 16 > cglobal %1 > %1: >+ push ebx >+ call get_pc.bx >+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ > %ifdef UNROLLED_LOOP >- mov ecx, [esp + 4] >+ mov ecx, [esp + 4 + 4] > %else >- push ebx >+ push esi > push edi >- mov ecx, [esp + 8 + 4] >+ mov ecx, [esp + 12 + 4] > %endif > > fLLM_PASS ecx+0, ecx+0, 3 >@@ -481,27 +485,28 @@ cglobal %1 > %ifdef UNROLLED_LOOP > %assign i 0 > %rep 8 >- %2 ecx+i*16, ecx+i*16, fdct_table+i*64, fdct_rounding_1+i*8, fdct_rounding_2+i*8 >+ %2 ecx+i*16, ecx+i*16, ebx + fdct_table+i*64 wrt 
..gotoff, ebx + fdct_rounding_1+i*8 wrt ..gotoff, ebx + fdct_rounding_2+i*8 wrt ..gotoff > %assign i i+1 > %endrep > %else > mov eax, 8 >- mov edx, fdct_table >- mov ebx, fdct_rounding_1 >- mov edi, fdct_rounding_2 >+ lea edx, [ebx + fdct_table wrt ..gotoff] >+ lea esi, [ebx + fdct_rounding_1 wrt ..gotoff] >+ lea edi, [ebx + fdct_rounding_2 wrt ..gotoff] > .loop >- %2 ecx, ecx, edx, ebx, edi >+ %2 ecx, ecx, edx, esi, edi > add ecx, 2*8 > add edx, 2*32 >- add ebx, 2*4 >+ add esi, 2*4 > add edi, 2*4 > dec eax > jne .loop > > pop edi >- pop ebx >+ pop esi > %endif > >+ pop ebx > ret > .endfunc > %endmacro >@@ -512,6 +517,11 @@ cglobal %1 > > SECTION .text > >+extern _GLOBAL_OFFSET_TABLE_ >+get_pc.bx: >+ mov ebx, [esp] >+ retn >+ > ;----------------------------------------------------------------------------- > ; void fdct_mmx_skal(int16_t block[64]]; > ;----------------------------------------------------------------------------- >@@ -523,3 +533,6 @@ MAKE_FDCT_FUNC fdct_mmx_skal, fMTX_MULT_ > ;----------------------------------------------------------------------------- > > MAKE_FDCT_FUNC fdct_xmm_skal, fMTX_MULT_XMM >+ >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/dct/x86_asm/fdct_sse2_skal.asm xvidcore-1.1.0-beta2/src/dct/x86_asm/fdct_sse2_skal.asm >--- xvidcore-1.1.0-beta2-old/src/dct/x86_asm/fdct_sse2_skal.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/dct/x86_asm/fdct_sse2_skal.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -302,10 +302,10 @@ cglobal fdct_sse2_skal > pshufd xmm6, xmm0, 01010101b ; [13131313] > pshufd xmm7, xmm0, 11111111b ; [57575757] > >- pmaddwd xmm4, [%2+ 0] ; dot [M00,M01][M04,M05][M08,M09][M12,M13] >- pmaddwd xmm5, [%2+16] ; dot [M02,M03][M06,M07][M10,M11][M14,M15] >- pmaddwd xmm6, [%2+32] ; dot [M16,M17][M20,M21][M24,M25][M28,M29] >- pmaddwd xmm7, [%2+48] ; dot [M18,M19][M22,M23][M26,M27][M30,M31] >+ pmaddwd xmm4, [ 0 + %2] ; dot [M00,M01][M04,M05][M08,M09][M12,M13] >+ pmaddwd xmm5, [16 + %2] ; dot [M02,M03][M06,M07][M10,M11][M14,M15] >+ pmaddwd xmm6, [32 + %2] ; dot [M16,M17][M20,M21][M24,M25][M28,M29] >+ pmaddwd xmm7, [48 + %2] ; dot [M18,M19][M22,M23][M26,M27][M30,M31] > paddd xmm4, [%3] ; Round > > paddd xmm6, xmm7 ; [b0|b1|b2|b3] >@@ -330,12 +330,12 @@ cglobal fdct_sse2_skal > > %macro iLLM_PASS 1 ; %1: src/dst > >- movdqa xmm0, [tan3] ; t3-1 >+ movdqa xmm0, [ebx + tan3 wrt ..gotoff] ; t3-1 > movdqa xmm3, [%1+16*3] ; x3 > movdqa xmm1, xmm0 ; t3-1 > movdqa xmm5, [%1+16*5] ; x5 > >- movdqa xmm4, [tan1] ; t1 >+ movdqa xmm4, [ebx + tan1 wrt ..gotoff] ; t1 > movdqa xmm6, [%1+16*1] ; x1 > movdqa xmm7, [%1+16*7] ; x7 > movdqa xmm2, xmm4 ; t1 >@@ -353,7 +353,7 @@ cglobal fdct_sse2_skal > psubsw xmm2, xmm7 ; x1*t1-x7 = tm17 > > >- movdqa xmm3, [sqrt2] >+ movdqa xmm3, [ebx + sqrt2 wrt ..gotoff] > movdqa xmm7, xmm4 > movdqa xmm6, xmm2 > psubsw xmm4, xmm1 ; tp17-tp35 = t1 >@@ -373,7 +373,7 @@ cglobal fdct_sse2_skal > paddsw xmm0, xmm0 ; 2.(t1+t2) = b1 > paddsw xmm4, xmm4 ; 2.(t1-t2) = b2 > >- movdqa xmm7, [tan2] ; t2 >+ movdqa xmm7, [ebx + tan2 wrt ..gotoff] ; t2 > movdqa xmm3, [%1+2*16] ; x2 > movdqa xmm6, [%1+6*16] ; x6 > movdqa xmm5, xmm7 ; t2 >@@ -445,14 +445,14 @@ cglobal fdct_sse2_skal > ALIGN 16 > idct_sse2_skal: > mov ecx, [esp+4] >- iMTX_MULT 0, iTab1, Idct_Rnd0, 11 >- iMTX_MULT 1, iTab2, Idct_Rnd1, 11 >- iMTX_MULT 2, iTab3, Idct_Rnd2, 11 >- iMTX_MULT 3, iTab4, Idct_Rnd3, 11 >- iMTX_MULT 4, iTab1, Idct_Rnd4, 11 >- iMTX_MULT 5, iTab4, Idct_Rnd5, 11 >- iMTX_MULT 6, iTab3, Idct_Rnd6, 11 >- 
iMTX_MULT 7, iTab2, Idct_Rnd7, 11 >+ iMTX_MULT 0, ebx + iTab1 wrt ..gotoff, ebx + Idct_Rnd0 wrt ..gotoff, 11 >+ iMTX_MULT 1, ebx + iTab2 wrt ..gotoff, ebx + Idct_Rnd1 wrt ..gotoff, 11 >+ iMTX_MULT 2, ebx + iTab3 wrt ..gotoff, ebx + Idct_Rnd2 wrt ..gotoff, 11 >+ iMTX_MULT 3, ebx + iTab4 wrt ..gotoff, ebx + Idct_Rnd3 wrt ..gotoff, 11 >+ iMTX_MULT 4, ebx + iTab1 wrt ..gotoff, ebx + Idct_Rnd4 wrt ..gotoff, 11 >+ iMTX_MULT 5, ebx + iTab4 wrt ..gotoff, ebx + Idct_Rnd5 wrt ..gotoff, 11 >+ iMTX_MULT 6, ebx + iTab3 wrt ..gotoff, ebx + Idct_Rnd6 wrt ..gotoff, 11 >+ iMTX_MULT 7, ebx + iTab2 wrt ..gotoff, ebx + Idct_Rnd7 wrt ..gotoff, 11 > iLLM_PASS ecx+0 > ret > .endfunc >@@ -476,59 +476,63 @@ idct_sse2_skal: > > ALIGN 16 > idct_sse2_sparse_skal: >+ push ebx >+ call get_pc.bx >+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc > >- mov ecx, [esp+ 4] ; Src >+ mov ecx, [esp+ 4 +4] ; Src > > TEST_ROW ecx, .Row0_Round >- iMTX_MULT 0, iTab1, Idct_Rnd0, 11 >+ iMTX_MULT 0, ebx + iTab1 wrt ..gotoff, ebx + Idct_Rnd0 wrt ..gotoff, 11 > jmp .Row1 > .Row0_Round >- movq mm0, [Idct_Sparse_Rnd0] >+ movq mm0, [ebx + Idct_Sparse_Rnd0 wrt ..gotoff] > movq [ecx ], mm0 > movq [ecx+8], mm0 > > .Row1 > TEST_ROW ecx+16, .Row1_Round >- iMTX_MULT 1, iTab2, Idct_Rnd1, 11 >+ iMTX_MULT 1, ebx + iTab2 wrt ..gotoff, ebx + Idct_Rnd1 wrt ..gotoff, 11 > jmp .Row2 > .Row1_Round >- movq mm0, [Idct_Sparse_Rnd1] >+ movq mm0, [ebx + Idct_Sparse_Rnd1 wrt ..gotoff] > movq [ecx+16 ], mm0 > movq [ecx+16+8], mm0 > > .Row2 > TEST_ROW ecx+32, .Row2_Round >- iMTX_MULT 2, iTab3, Idct_Rnd2, 11 >+ iMTX_MULT 2, ebx + iTab3 wrt ..gotoff, ebx + Idct_Rnd2 wrt ..gotoff, 11 > jmp .Row3 > .Row2_Round >- movq mm0, [Idct_Sparse_Rnd2] >+ movq mm0, [ebx + Idct_Sparse_Rnd2 wrt ..gotoff] > movq [ecx+32 ], mm0 > movq [ecx+32+8], mm0 > > .Row3 > TEST_ROW ecx+48, .Row4 >- iMTX_MULT 3, iTab4, Idct_Rnd3, 11 >+ iMTX_MULT 3, ebx + iTab4 wrt ..gotoff, ebx + Idct_Rnd3 wrt ..gotoff, 11 > jmp .Row4 > > .Row4 > TEST_ROW ecx+64, .Row5 >- iMTX_MULT 4, iTab1, Idct_Rnd4, 11 >+ iMTX_MULT 4, ebx + iTab1 wrt ..gotoff, ebx + Idct_Rnd4 wrt ..gotoff, 11 > jmp .Row5 > > .Row5 > TEST_ROW ecx+80, .Row6 >- iMTX_MULT 5, iTab4, Idct_Rnd5, 11 >+ iMTX_MULT 5, ebx + iTab4 wrt ..gotoff, ebx + Idct_Rnd5 wrt ..gotoff, 11 > > .Row6 > TEST_ROW ecx+96, .Row7 >- iMTX_MULT 6, iTab3, Idct_Rnd6, 11 >+ iMTX_MULT 6, ebx + iTab3 wrt ..gotoff, ebx + Idct_Rnd6 wrt ..gotoff, 11 > > .Row7 > TEST_ROW ecx+112, .End >- iMTX_MULT 7, iTab2, Idct_Rnd7, 11 >+ iMTX_MULT 7, ebx + iTab2 wrt ..gotoff, ebx + Idct_Rnd7 wrt ..gotoff, 11 > .End > > iLLM_PASS ecx+0 >+ pop ebx > ret > .endfunc > >@@ -585,15 +589,15 @@ idct_sse2_sparse_skal: > paddsw xmm2, xmm1 ; xmm2: t6+t5 > movdqa [%1+0*16], xmm5 ; => out0 > >- movdqa xmm4, [tan2] ; xmm4 <= tan2 >+ movdqa xmm4, [ebx + tan2 wrt ..gotoff] ; xmm4 <= tan2 > pmulhw xmm4, xmm7 ; tm03*tan2 >- movdqa xmm5, [tan2] ; xmm5 <= tan2 >+ movdqa xmm5, [ebx + tan2 wrt ..gotoff] ; xmm5 <= tan2 > psubsw xmm4, xmm6 ; out6 = tm03*tan2 - tm12 > pmulhw xmm5, xmm6 ; tm12*tan2 > paddsw xmm5, xmm7 ; out2 = tm12*tan2 + tm03 > >- movdqa xmm6, [sqrt2] >- movdqa xmm7, [Rounder1] >+ movdqa xmm6, [ebx + sqrt2 wrt ..gotoff] >+ movdqa xmm7, [ebx + Rounder1 wrt ..gotoff] > > pmulhw xmm2, xmm6 ; xmm2: tp65 = (t6 + t5)*cos4 > por xmm5, xmm7 ; correct out2 >@@ -611,8 +615,8 @@ idct_sse2_sparse_skal: > paddsw xmm2, xmm4 ; xmm2: tp765 = t7 + tp65 > paddsw xmm1, xmm5 ; xmm1: tp465 = t4 + tm65 > >- movdqa xmm4, [tan3] ; tan3 - 1 >- movdqa xmm5, [tan1] ; tan1 >+ movdqa xmm4, [ebx + tan3 wrt ..gotoff] ; tan3 - 1 
>+ movdqa xmm5, [ebx + tan1 wrt ..gotoff] ; tan1 > > movdqa xmm7, xmm3 ; save tm465 > pmulhw xmm3, xmm4 ; tm465*(tan3-1) >@@ -659,12 +663,12 @@ idct_sse2_sparse_skal: > ; [M08 M09 M24 M25] [M14 M15 M30 M31] x mm0 = [4 /5 /6'/7'] > ; [M10 M11 M26 M27] [M12 M13 M28 M29] x mm2 = [4'/5'/6 /7 ] > >- movdqa xmm1, [%2+16] >- movdqa xmm3, [%2+32] >+ movdqa xmm1, [16+%2] >+ movdqa xmm3, [32+%2] > pmaddwd xmm1, xmm2 > pmaddwd xmm3, xmm0 >- pmaddwd xmm2, [%2+48] >- pmaddwd xmm0, [%2+ 0] >+ pmaddwd xmm2, [48+%2] >+ pmaddwd xmm0, [ 0+%2] > > paddd xmm0, xmm1 ; [ out0 | out1 ][ out2 | out3 ] > paddd xmm2, xmm3 ; [ out4 | out5 ][ out6 | out7 ] >@@ -679,22 +683,35 @@ idct_sse2_sparse_skal: > movdqa [ecx+%1*16+0], xmm0 > %endmacro > >+extern _GLOBAL_OFFSET_TABLE_ >+get_pc.bx: >+ mov ebx, [esp] >+ retn >+ > ;----------------------------------------------------------------------------- > ; Function Forward DCT > ;----------------------------------------------------------------------------- > > ALIGN 16 > fdct_sse2_skal: >- mov ecx, [esp+4] >+ push ebx >+ call get_pc.bx >+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ mov ecx, [esp+4+4] > fLLM_PASS ecx+0, 3 >- fMTX_MULT 0, fTab1, Fdct_Rnd0 >- fMTX_MULT 1, fTab2, Fdct_Rnd2 >- fMTX_MULT 2, fTab3, Fdct_Rnd1 >- fMTX_MULT 3, fTab4, Fdct_Rnd1 >- fMTX_MULT 4, fTab1, Fdct_Rnd0 >- fMTX_MULT 5, fTab4, Fdct_Rnd1 >- fMTX_MULT 6, fTab3, Fdct_Rnd1 >- fMTX_MULT 7, fTab2, Fdct_Rnd1 >+ fMTX_MULT 0, ebx + fTab1 wrt ..gotoff, ebx + Fdct_Rnd0 wrt ..gotoff >+ fMTX_MULT 1, ebx + fTab2 wrt ..gotoff, ebx + Fdct_Rnd2 wrt ..gotoff >+ fMTX_MULT 2, ebx + fTab3 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff >+ fMTX_MULT 3, ebx + fTab4 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff >+ fMTX_MULT 4, ebx + fTab1 wrt ..gotoff, ebx + Fdct_Rnd0 wrt ..gotoff >+ fMTX_MULT 5, ebx + fTab4 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff >+ fMTX_MULT 6, ebx + fTab3 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff >+ fMTX_MULT 7, ebx + fTab2 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff >+ >+ pop ebx > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/dct/x86_asm/idct_3dne.asm xvidcore-1.1.0-beta2/src/dct/x86_asm/idct_3dne.asm >--- xvidcore-1.1.0-beta2-old/src/dct/x86_asm/idct_3dne.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/dct/x86_asm/idct_3dne.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -223,6 +223,11 @@ tab_i_35_xmm: > > SECTION .text > >+extern _GLOBAL_OFFSET_TABLE_ >+get_pc.bx: >+ mov ebx, [esp] >+ retn >+ > cglobal idct_3dne > > ;----------------------------------------------------------------------------- >@@ -231,25 +236,29 @@ cglobal idct_3dne > > ALIGN 16 > idct_3dne: >- mov eax, [esp+4] >+ push ebx >+ call get_pc.bx >+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ mov eax, [esp+4+4] > > ; DCT_8_INV_ROW_1_s [eax+64], [eax+64], tab_i_04_sse, rounder_4 ;rounder_4=0 > pshufw mm0, [eax+64],10001000b ; x2 x0 x2 x0 >- movq mm3, [tab_i_04_xmm] ; 3 ; w05 w04 w01 w00 >+ movq mm3, [ebx + tab_i_04_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 > pshufw mm1, [eax+64+8],10001000b ; x6 x4 x6 x4 >- movq mm4, [tab_i_04_xmm+8] ; 4 ; w07 w06 w03 w02 >+ movq mm4, [ebx + tab_i_04_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 > pshufw mm2, [eax+64],11011101b ; x3 x1 x3 x1 > pshufw mm5, [eax+64+8],11011101b ; x7 x5 x7 x5 >- movq mm6, [tab_i_04_xmm+32] ; 6 ; w21 w20 w17 w16 >+ movq mm6, [ebx + tab_i_04_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 > pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 >- movq mm7, [tab_i_04_xmm+40] ; 7 ; 
w23 w22 w19 w18 ; >- pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 >+ movq mm7, [ebx + tab_i_04_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 ; >+ pmaddwd mm0, [ebx + tab_i_04_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 > pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 >- pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 >+ pmaddwd mm1, [ebx + tab_i_04_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 > pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 >- pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 >+ pmaddwd mm2, [ebx + tab_i_04_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 > pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 >- pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 >+ pmaddwd mm5, [ebx + tab_i_04_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 > paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) > paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) > pshufw mm1, [eax+80+8],10001000b ; x6 x4 x6 x4 >@@ -260,12 +269,12 @@ idct_3dne: > movq mm7, mm0 ; 7 ; a3 a2 > psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 > paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 >- movq mm3, [tab_i_35_xmm] ; 3 ; w05 w04 w01 w00 >+ movq mm3, [ebx + tab_i_35_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 > psubd mm7, mm2 ; ; a3-b3 a2-b2 > paddd mm0, mm2 ; 0 free a3+b3 a2+b2 > pshufw mm2, [eax+80],11011101b; x3 x1 x3 x1 > pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 >- pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 >+ pmaddwd mm5, [ebx + tab_i_35_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 > psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 > psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 > psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 >@@ -276,19 +285,19 @@ idct_3dne: > movq [eax+64], mm6 ; 3 ; save y3 y2 y1 y0 stall2 > > ; DCT_8_INV_ROW_1_s [eax+80], [eax+80], tab_i_35_xmm, rounder_5 >- movq mm4, [tab_i_35_xmm+8] ; 4 ; w07 w06 w03 w02 >- movq mm6, [tab_i_35_xmm+32] ; 6 ; w21 w20 w17 w16 >+ movq mm4, [ebx + tab_i_35_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 >+ movq mm6, [ebx + tab_i_35_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 > pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 >- paddd mm3, [rounder_5] ; +rounder stall 6 >- paddd mm5, [rounder_5] ; +rounder >+ paddd mm3, [ebx + rounder_5 wrt ..gotoff] ; +rounder stall 6 >+ paddd mm5, [ebx + rounder_5 wrt ..gotoff] ; +rounder > movq [eax+64+8], mm7 ; 7 ; save y7 y6 y5 y4 >- movq mm7, [tab_i_35_xmm+40] ; 7 ; w23 w22 w19 w18 >+ movq mm7, [ebx + tab_i_35_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 > pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 >- pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 >+ pmaddwd mm1, [ebx + tab_i_35_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 > pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 >- pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 >+ pmaddwd mm2, [ebx + tab_i_35_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 > pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 >- pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 >+ pmaddwd mm0, [ebx + tab_i_35_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 > paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) > paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) > pshufw mm1, [eax+96+8],10001000b ; x6 x4 x6 x4 >@@ -299,12 +308,12 @@ idct_3dne: > movq mm7, mm5 ; 7 ; a3 a2 > psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 > paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 >- movq mm3, [tab_i_26_xmm] ; 3 ; w05 w04 w01 w00 >+ movq mm3, [ebx + 
tab_i_26_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 > psubd mm7, mm2 ; ; a3-b3 a2-b2 > paddd mm5, mm2 ; 0 free a3+b3 a2+b2 > pshufw mm2, [eax+96],11011101b; x3 x1 x3 x1 > pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 >- pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 >+ pmaddwd mm0, [ebx + tab_i_26_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 > psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 > psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 > psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 >@@ -315,19 +324,19 @@ idct_3dne: > movq [eax+80], mm6 ; 3 ; save y3 y2 y1 y0 > > ; DCT_8_INV_ROW_1_s [eax+96], [eax+96], tab_i_26_xmm, rounder_6 >- movq mm4, [tab_i_26_xmm+8] ; 4 ; w07 w06 w03 w02 >- movq mm6, [tab_i_26_xmm+32] ; 6 ; w21 w20 w17 w16 >+ movq mm4, [ebx + tab_i_26_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 >+ movq mm6, [ebx + tab_i_26_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 > pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 >- paddd mm3, [rounder_6] ; +rounder >- paddd mm0, [rounder_6] ; +rounder >+ paddd mm3, [ebx + rounder_6 wrt ..gotoff] ; +rounder >+ paddd mm0, [ebx + rounder_6 wrt ..gotoff] ; +rounder > movq [eax+80+8], mm7 ; 7 ; save y7 y6 >- movq mm7, [tab_i_26_xmm+40] ; 7 ; w23 w22 w19 w18 >+ movq mm7, [ebx + tab_i_26_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 > pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 >- pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 >+ pmaddwd mm1, [ebx + tab_i_26_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 > pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 >- pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 >+ pmaddwd mm2, [ebx + tab_i_26_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 > pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 >- pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 >+ pmaddwd mm5, [ebx + tab_i_26_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 > paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) > paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) > pshufw mm1, [eax+112+8],10001000b ; x6 x4 x6 x4 >@@ -338,12 +347,12 @@ idct_3dne: > movq mm7, mm0 ; 7 ; a3 a2 > psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 > paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 >- movq mm3, [tab_i_17_xmm] ; 3 ; w05 w04 w01 w00 >+ movq mm3, [ebx + tab_i_17_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 > psubd mm7, mm2 ; ; a3-b3 a2-b2 > paddd mm0, mm2 ; 0 free a3+b3 a2+b2 > pshufw mm2, [eax+112],11011101b; x3 x1 x3 x1 > pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 >- pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 >+ pmaddwd mm5, [ebx + tab_i_17_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 > psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 > psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 > psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 >@@ -354,19 +363,19 @@ idct_3dne: > movq [eax+96], mm6 ; 3 ; save y3 y2 y1 y0 stall2 > > ; DCT_8_INV_ROW_1_s [eax+112], [eax+112], tab_i_17_xmm, rounder_7 >- movq mm4, [tab_i_17_xmm+8] ; 4 ; w07 w06 w03 w02 >- movq mm6, [tab_i_17_xmm+32] ; 6 ; w21 w20 w17 w16 >+ movq mm4, [ebx + tab_i_17_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 >+ movq mm6, [ebx + tab_i_17_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 > pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 >- paddd mm3, [rounder_7] ; +rounder stall 6 >- paddd mm5, [rounder_7] ; +rounder >+ paddd mm3, [ebx + rounder_7 wrt ..gotoff] ; +rounder stall 6 >+ paddd mm5, [ebx + rounder_7 wrt ..gotoff] ; +rounder > movq [eax+96+8], mm7 ; 7 ; save y7 y6 y5 y4 >- movq mm7, [tab_i_17_xmm+40] ; 7 ; w23 w22 w19 w18 >+ movq mm7, 
[ebx + tab_i_17_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 > pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 >- pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 >+ pmaddwd mm1, [ebx + tab_i_17_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 > pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 >- pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 >+ pmaddwd mm2, [ebx + tab_i_17_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 > pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 >- pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 >+ pmaddwd mm0, [ebx + tab_i_17_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 > paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) > paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) > pshufw mm1, [eax+0+8],10001000b; x6 x4 x6 x4 >@@ -377,12 +386,12 @@ idct_3dne: > movq mm7, mm5 ; 7 ; a3 a2 > psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 > paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 >- movq mm3, [tab_i_04_xmm] ; 3 ; w05 w04 w01 w00 >+ movq mm3, [ebx + tab_i_04_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 > psubd mm7, mm2 ; ; a3-b3 a2-b2 > paddd mm5, mm2 ; 0 free a3+b3 a2+b2 > pshufw mm2, [eax+0],11011101b ; x3 x1 x3 x1 > pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 >- pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 >+ pmaddwd mm0, [ebx + tab_i_04_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 > psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 > psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 > psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 >@@ -393,19 +402,19 @@ idct_3dne: > movq [eax+112], mm6 ; 3 ; save y3 y2 y1 y0 > > ; DCT_8_INV_ROW_1_s [eax+0], 0, tab_i_04_xmm, rounder_0 >- movq mm4, [tab_i_04_xmm+8] ; 4 ; w07 w06 w03 w02 >- movq mm6, [tab_i_04_xmm+32] ; 6 ; w21 w20 w17 w16 >+ movq mm4, [ebx + tab_i_04_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 >+ movq mm6, [ebx + tab_i_04_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 > pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 >- paddd mm3, [rounder_0] ; +rounder >- paddd mm0, [rounder_0] ; +rounder >+ paddd mm3, [ebx + rounder_0 wrt ..gotoff] ; +rounder >+ paddd mm0, [ebx + rounder_0 wrt ..gotoff] ; +rounder > movq [eax+112+8], mm7 ; 7 ; save y7 y6 >- movq mm7, [tab_i_04_xmm+40] ; 7 ; w23 w22 w19 w18 >+ movq mm7, [ebx + tab_i_04_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 > pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 >- pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 >+ pmaddwd mm1, [ebx + tab_i_04_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 > pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 >- pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 >+ pmaddwd mm2, [ebx + tab_i_04_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 > pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 >- pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 >+ pmaddwd mm5, [ebx + tab_i_04_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 > paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) > paddd mm0, mm1 ; 1 > pshufw mm1, [eax+16+8],10001000b ; x6 x4 x6 x4 >@@ -416,12 +425,12 @@ idct_3dne: > movq mm7, mm0 ; 7 ; a3 a2 > psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 > paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 >- movq mm3, [tab_i_17_xmm] ; 3 ; w05 w04 w01 w00 >+ movq mm3, [ebx + tab_i_17_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 > psubd mm7, mm2 ; ; a3-b3 a2-b2 > paddd mm0, mm2 ; 0 free a3+b3 a2+b2 > pshufw mm2, [eax+16],11011101b; x3 x1 x3 x1 > pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 >- pmaddwd mm5, [tab_i_17_xmm+16]; 
x2*w13+x0*w12 x2*w09+x0*w08 >+ pmaddwd mm5, [ebx + tab_i_17_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 > psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 > psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 > psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 >@@ -432,19 +441,19 @@ idct_3dne: > movq [eax+0], mm6 ; 3 ; save y3 y2 y1 y0 stall2 > > ; DCT_8_INV_ROW_1_s [eax+16], 16, tab_i_17_xmm, rounder_1 >- movq mm4, [tab_i_17_xmm+8] ; 4 ; w07 w06 w03 w02 >- movq mm6, [tab_i_17_xmm+32] ; 6 ; w21 w20 w17 w16 >+ movq mm4, [ebx + tab_i_17_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 >+ movq mm6, [ebx + tab_i_17_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 > pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 >- paddd mm3, [rounder_1] ; +rounder stall 6 >- paddd mm5, [rounder_1] ; +rounder >+ paddd mm3, [ebx + rounder_1 wrt ..gotoff] ; +rounder stall 6 >+ paddd mm5, [ebx + rounder_1 wrt ..gotoff] ; +rounder > movq [eax+0+8], mm7 ; 7 ; save y7 y6 y5 y4 >- movq mm7, [tab_i_17_xmm+40] ; 7 ; w23 w22 w19 w18 >+ movq mm7, [ebx + tab_i_17_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 > pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 >- pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 >+ pmaddwd mm1, [ebx + tab_i_17_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 > pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 >- pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 >+ pmaddwd mm2, [ebx + tab_i_17_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 > pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 >- pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 >+ pmaddwd mm0, [ebx + tab_i_17_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 > paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) > paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) > pshufw mm1, [eax+32+8],10001000b ; x6 x4 x6 x4 >@@ -455,12 +464,12 @@ idct_3dne: > movq mm7, mm5 ; 7 ; a3 a2 > psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 > paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 >- movq mm3, [tab_i_26_xmm] ; 3 ; w05 w04 w01 w00 >+ movq mm3, [ebx + tab_i_26_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 > psubd mm7, mm2 ; ; a3-b3 a2-b2 > paddd mm5, mm2 ; 0 free a3+b3 a2+b2 > pshufw mm2, [eax+32],11011101b; x3 x1 x3 x1 > pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 >- pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 >+ pmaddwd mm0, [ebx + tab_i_26_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 > psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 > psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 > psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 >@@ -471,19 +480,19 @@ idct_3dne: > movq [eax+16], mm6 ; 3 ; save y3 y2 y1 y0 > > ; DCT_8_INV_ROW_1_s [eax+32], 32, tab_i_26_xmm, rounder_2 >- movq mm4, [tab_i_26_xmm+8] ; 4 ; w07 w06 w03 w02 >- movq mm6, [tab_i_26_xmm+32] ; 6 ; w21 w20 w17 w16 >+ movq mm4, [ebx + tab_i_26_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 >+ movq mm6, [ebx + tab_i_26_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 > pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 >- paddd mm3, [rounder_2] ; +rounder >- paddd mm0, [rounder_2] ; +rounder >+ paddd mm3, [ebx + rounder_2 wrt ..gotoff] ; +rounder >+ paddd mm0, [ebx + rounder_2 wrt ..gotoff] ; +rounder > movq [eax+16+8], mm7 ; 7 ; save y7 y6 >- movq mm7, [tab_i_26_xmm+40] ; 7 ; w23 w22 w19 w18 >+ movq mm7, [ebx + tab_i_26_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 > pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 >- pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 >+ pmaddwd mm1, [ebx + tab_i_26_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 > pmaddwd 
mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 >- pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 >+ pmaddwd mm2, [ebx + tab_i_26_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 > pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 >- pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 >+ pmaddwd mm5, [ebx + tab_i_26_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 > paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) > paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) > pshufw mm1, [eax+48+8],10001000b ; x6 x4 x6 x4 >@@ -494,12 +503,12 @@ idct_3dne: > movq mm7, mm0 ; 7 ; a3 a2 > psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 > paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 >- movq mm3, [tab_i_35_xmm] ; 3 ; w05 w04 w01 w00 >+ movq mm3, [ebx + tab_i_35_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 > psubd mm7, mm2 ; ; a3-b3 a2-b2 > paddd mm0, mm2 ; 0 free a3+b3 a2+b2 > pshufw mm2, [eax+48],11011101b; x3 x1 x3 x1 > pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 >- pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 >+ pmaddwd mm5, [ebx + tab_i_35_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 > psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 > psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 > psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 >@@ -510,26 +519,26 @@ idct_3dne: > movq [eax+32], mm6 ; 3 ; save y3 y2 y1 y0 stall2 > > ; DCT_8_INV_ROW_1_s [eax+48], [eax+48], tab_i_35_xmm, rounder_3 >- movq mm4, [tab_i_35_xmm+8] ; 4 ; w07 w06 w03 w02 >- movq mm6, [tab_i_35_xmm+32] ; 6 ; w21 w20 w17 w16 >+ movq mm4, [ebx + tab_i_35_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 >+ movq mm6, [ebx + tab_i_35_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 > pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 >- paddd mm3, [rounder_3] ; +rounder stall 6 >- paddd mm5, [rounder_3] ; +rounder >+ paddd mm3, [ebx + rounder_3 wrt ..gotoff] ; +rounder stall 6 >+ paddd mm5, [ebx + rounder_3 wrt ..gotoff] ; +rounder > movq [eax+32+8], mm7 ; 7 ; save y7 y6 y5 y4 >- movq mm7, [tab_i_35_xmm+40] ; 7 ; w23 w22 w19 w18 >+ movq mm7, [ebx + tab_i_35_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 > pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 >- pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 >+ pmaddwd mm1, [ebx + tab_i_35_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 > pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 >- pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 >+ pmaddwd mm2, [ebx + tab_i_35_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 > pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 >- pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 >+ pmaddwd mm0, [ebx + tab_i_35_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 > paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) > paddd mm5, mm1 ; mm1 free ; a3=sum(even3) a2=sum(even2) >- movq mm1, [tg_3_16] >+ movq mm1, [ebx + tg_3_16 wrt ..gotoff] > movq mm4, mm3 ; 4 ; a1 a0 > paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) > paddd mm2, mm0 ; 5 free ; b3=sum(odd3) b2=sum(odd2) >- movq mm0, [tg_3_16] >+ movq mm0, [ebx + tg_3_16 wrt ..gotoff] > movq mm7, mm5 ; 7 ; a3 a2 > psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 > paddd mm3, mm6 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 >@@ -542,7 +551,7 @@ idct_3dne: > psrad mm2, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 > movq mm6, [eax+16*1] > packssdw mm7, mm4 ; 4 ; y6 y7 y4 y5 >- movq mm4, [tg_1_16] >+ movq mm4, [ebx + tg_1_16 wrt ..gotoff] > packssdw mm3, mm2 ; 0 free ; y3 y2 y1 y0 > pshufw mm2, mm7, 10110001b ; y7 y6 y5 y4 > >@@ -559,7 +568,7 @@ idct_3dne: > paddsw mm1, mm3 
; x3+x5*(tg_3_16-1) > psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 > movq [eax+48], mm3 ; 3 ; save y3 y2 y1 y0 >- movq mm3, [ocos_4_16] >+ movq mm3, [ebx + ocos_4_16 wrt ..gotoff] > paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 > paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 > psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17 >@@ -569,7 +578,7 @@ idct_3dne: > psubsw mm6, mm0 ; tm17-tm35 = b3 > psubsw mm4, mm1 ; tp17-tp35 = t1 > paddsw mm2, mm0 ; tm17+tm35 = t2 >- movq mm7, [tg_2_16] >+ movq mm7, [ebx + tg_2_16 wrt ..gotoff] > movq mm1, mm4 ; t1 > movq [eax+3*16], mm5 ; save b0 > paddsw mm1, mm2 ; t1+t2 >@@ -620,7 +629,7 @@ idct_3dne: > movq mm6, mm2 ; a3 > psraw mm4, SHIFT_INV_COL ; dst7 > movq [eax+5*16], mm0 >- movq mm0, [tg_3_16] >+ movq mm0, [ebx + tg_3_16 wrt ..gotoff] > paddsw mm2, mm3 ; a3+b3 > movq [eax+6*16], mm7 > psubsw mm6, mm3 ; a3-b3 >@@ -634,7 +643,7 @@ idct_3dne: > movq mm5, [eax+8+16*5] > psraw mm6, SHIFT_INV_COL ; dst4 > pmulhw mm0, mm3 ; x3*(tg_3_16-1) >- movq mm4, [tg_1_16] >+ movq mm4, [ebx + tg_1_16 wrt ..gotoff] > pmulhw mm1, mm5 ; x5*(tg_3_16-1) > movq mm7, [eax+8+16*7] > movq [eax+3*16], mm2 >@@ -646,7 +655,7 @@ idct_3dne: > pmulhw mm2, mm6 ; x1*tg_1_16 > paddsw mm1, mm3 ; x3+x5*(tg_3_16-1) > psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 >- movq mm3, [ocos_4_16] >+ movq mm3, [ebx + ocos_4_16 wrt ..gotoff] > paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 > paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 > psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17 >@@ -655,7 +664,7 @@ idct_3dne: > paddsw mm5, mm1 ; tp17+tp35 = b0 > psubsw mm4, mm1 ; tp17-tp35 = t1 > paddsw mm2, mm0 ; tm17+tm35 = t2 >- movq mm7, [tg_2_16] >+ movq mm7, [ebx + tg_2_16 wrt ..gotoff] > movq mm1, mm4 ; t1 > psubsw mm6, mm0 ; tm17-tm35 = b3 > movq [eax+8+3*16], mm5 ; save b0 >@@ -717,6 +726,9 @@ idct_3dne: > movq [eax+8+3*16], mm2 > movq [eax+8+4*16], mm6 > >+ pop ebx > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/dct/x86_asm/idct_mmx.asm xvidcore-1.1.0-beta2/src/dct/x86_asm/idct_mmx.asm >--- xvidcore-1.1.0-beta2-old/src/dct/x86_asm/idct_mmx.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/dct/x86_asm/idct_mmx.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -326,25 +326,25 @@ tab_i_35_xmm: > punpcklwd mm0, mm1 ; x5 x1 x4 x0 > movq mm5, mm0 ; 5 ; x5 x1 x4 x0 > punpckldq mm0, mm0 ; x4 x0 x4 x0 >- movq mm4, [%3+8] ; 4 ; w07 w05 w03 w01 >+ movq mm4, [8+%3] ; 4 ; w07 w05 w03 w01 > punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2 > pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00 > movq mm6, mm2 ; 6 ; x7 x3 x6 x2 >- movq mm1, [%3+32] ; 1 ; w22 w20 w18 w16 >+ movq mm1, [32+%3] ; 1 ; w22 w20 w18 w16 > punpckldq mm2, mm2 ; x6 x2 x6 x2 > pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01 > punpckhdq mm5, mm5 ; x5 x1 x5 x1 >- pmaddwd mm0, [%3+16] ; x4*w14+x0*w12 x4*w10+x0*w08 >+ pmaddwd mm0, [16+%3] ; x4*w14+x0*w12 x4*w10+x0*w08 > punpckhdq mm6, mm6 ; x7 x3 x7 x3 >- movq mm7, [%3+40] ; 7 ; w23 w21 w19 w17 >+ movq mm7, [40+%3] ; 7 ; w23 w21 w19 w17 > pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16 > paddd mm3, [%4] ; +%4 > pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17 >- pmaddwd mm2, [%3+24] ; x6*w15+x2*w13 x6*w11+x2*w09 >+ pmaddwd mm2, [24+%3] ; x6*w15+x2*w13 x6*w11+x2*w09 > paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) >- pmaddwd mm5, [%3+48] ; x5*w30+x1*w28 x5*w26+x1*w24 >+ pmaddwd mm5, [48+%3] ; x5*w30+x1*w28 x5*w26+x1*w24 > movq mm4, mm3 ; 4 ; a1 a0 >- pmaddwd mm6, [%3+56] ; x7*w31+x3*w29 x7*w27+x3*w25 >+ pmaddwd mm6, [56+%3] ; x7*w31+x3*w29 x7*w27+x3*w25 > paddd mm1, mm7 ; 7 ; b1=sum(odd1) 
b0=sum(odd0) > paddd mm0, [%4] ; +%4 > psubd mm3, mm1 ; a1-b1 a0-b0 >@@ -378,25 +378,25 @@ tab_i_35_xmm: > movq mm2, mm0 ; 2 ; x3 x2 x1 x0 > movq mm3, [%3] ; 3 ; w05 w04 w01 w00 > pshufw mm0, mm0, 10001000b ; x2 x0 x2 x0 >- movq mm4, [%3+8] ; 4 ; w07 w06 w03 w02 >+ movq mm4, [8+%3] ; 4 ; w07 w06 w03 w02 > movq mm5, mm1 ; 5 ; x7 x6 x5 x4 > pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 >- movq mm6, [%3+32] ; 6 ; w21 w20 w17 w16 >+ movq mm6, [32+%3] ; 6 ; w21 w20 w17 w16 > pshufw mm1, mm1, 10001000b ; x6 x4 x6 x4 > pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 >- movq mm7, [%3+40] ; 7 ; w23 w22 w19 w18 >+ movq mm7, [40+%3] ; 7 ; w23 w22 w19 w18 > pshufw mm2, mm2, 11011101b ; x3 x1 x3 x1 > pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 > pshufw mm5, mm5, 11011101b ; x7 x5 x7 x5 > pmaddwd mm7, mm5 ; x7*w23+x5*w22 x7*w19+x5*w18 > paddd mm3, [%4] ; +%4 >- pmaddwd mm0, [%3+16] ; x2*w13+x0*w12 x2*w09+x0*w08 >+ pmaddwd mm0, [16+%3] ; x2*w13+x0*w12 x2*w09+x0*w08 > paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) >- pmaddwd mm1, [%3+24] ; x6*w15+x4*w14 x6*w11+x4*w10 >+ pmaddwd mm1, [24+%3] ; x6*w15+x4*w14 x6*w11+x4*w10 > movq mm4, mm3 ; 4 ; a1 a0 >- pmaddwd mm2, [%3+48] ; x3*w29+x1*w28 x3*w25+x1*w24 >+ pmaddwd mm2, [48+%3] ; x3*w29+x1*w28 x3*w25+x1*w24 > paddd mm6, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) >- pmaddwd mm5, [%3+56] ; x7*w31+x5*w30 x7*w27+x5*w26 >+ pmaddwd mm5, [56+%3] ; x7*w31+x5*w30 x7*w27+x5*w26 > paddd mm3, mm6 ; a1+b1 a0+b0 > paddd mm0, [%4] ; +%4 > psrad mm3, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 >@@ -480,12 +480,12 @@ tab_i_35_xmm: > ;----------------------------------------------------------------------------- > > %macro DCT_8_INV_COL 2 >- movq mm0, [tg_3_16] >+ movq mm0, [ebx + tg_3_16 wrt ..gotoff] > movq mm3, [%1+16*3] > movq mm1, mm0 ; tg_3_16 > movq mm5, [%1+16*5] > pmulhw mm0, mm3 ; x3*(tg_3_16-1) >- movq mm4, [tg_1_16] >+ movq mm4, [ebx + tg_1_16 wrt ..gotoff] > pmulhw mm1, mm5 ; x5*(tg_3_16-1) > movq mm7, [%1+16*7] > movq mm2, mm4 ; tg_1_16 >@@ -495,7 +495,7 @@ tab_i_35_xmm: > pmulhw mm2, mm6 ; x1*tg_1_16 > paddsw mm1, mm3 ; x3+x5*(tg_3_16-1) > psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 >- movq mm3, [ocos_4_16] >+ movq mm3, [ebx + ocos_4_16 wrt ..gotoff] > paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 > paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 > psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17 >@@ -505,7 +505,7 @@ tab_i_35_xmm: > psubsw mm6, mm0 ; tm17-tm35 = b3 > psubsw mm4, mm1 ; tp17-tp35 = t1 > paddsw mm2, mm0 ; tm17+tm35 = t2 >- movq mm7, [tg_2_16] >+ movq mm7, [ebx + tg_2_16 wrt ..gotoff] > movq mm1, mm4 ; t1 > ; movq [SCRATCH+0], mm5 ; save b0 > movq [%2+3*16], mm5 ; save b0 >@@ -577,6 +577,11 @@ tab_i_35_xmm: > > SECTION .text > >+extern _GLOBAL_OFFSET_TABLE_ >+get_pc.bx: >+ mov ebx, [esp] >+ retn >+ > cglobal idct_mmx > cglobal idct_xmm > >@@ -586,22 +591,27 @@ cglobal idct_xmm > > ALIGN 16 > idct_mmx: >- mov eax, dword [esp + 4] >+ push ebx >+ call get_pc.bx >+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ mov eax, dword [esp + 4 + 4] > > ;; Process each row >- DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, tab_i_04_mmx, rounder_0 >- DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, tab_i_17_mmx, rounder_1 >- DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, tab_i_26_mmx, rounder_2 >- DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, tab_i_35_mmx, rounder_3 >- DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, tab_i_04_mmx, rounder_4 >- DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, tab_i_35_mmx, rounder_5 >- DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, tab_i_26_mmx, rounder_6 >- DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, tab_i_17_mmx, rounder_7 >+ 
DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, ebx + tab_i_04_mmx wrt ..gotoff, ebx + rounder_0 wrt ..gotoff >+ DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, ebx + tab_i_17_mmx wrt ..gotoff, ebx + rounder_1 wrt ..gotoff >+ DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, ebx + tab_i_26_mmx wrt ..gotoff, ebx + rounder_2 wrt ..gotoff >+ DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, ebx + tab_i_35_mmx wrt ..gotoff, ebx + rounder_3 wrt ..gotoff >+ DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, ebx + tab_i_04_mmx wrt ..gotoff, ebx + rounder_4 wrt ..gotoff >+ DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, ebx + tab_i_35_mmx wrt ..gotoff, ebx + rounder_5 wrt ..gotoff >+ DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, ebx + tab_i_26_mmx wrt ..gotoff, ebx + rounder_6 wrt ..gotoff >+ DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, ebx + tab_i_17_mmx wrt ..gotoff, ebx + rounder_7 wrt ..gotoff > > ;; Process the columns (4 at a time) > DCT_8_INV_COL eax+0, eax+0 > DCT_8_INV_COL eax+8, eax+8 > >+ pop ebx > ret > .endfunc > >@@ -611,22 +621,29 @@ idct_mmx: > > ALIGN 16 > idct_xmm: >- mov eax, dword [esp + 4] >+ push ebx >+ call get_pc.bx >+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ mov eax, dword [esp + 4 + 4] > > ;; Process each row >- DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, tab_i_04_xmm, rounder_0 >- DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, tab_i_17_xmm, rounder_1 >- DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, tab_i_26_xmm, rounder_2 >- DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, tab_i_35_xmm, rounder_3 >- DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, tab_i_04_xmm, rounder_4 >- DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, tab_i_35_xmm, rounder_5 >- DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, tab_i_26_xmm, rounder_6 >- DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, tab_i_17_xmm, rounder_7 >+ DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, ebx + tab_i_04_xmm wrt ..gotoff, ebx + rounder_0 wrt ..gotoff >+ DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, ebx + tab_i_17_xmm wrt ..gotoff, ebx + rounder_1 wrt ..gotoff >+ DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, ebx + tab_i_26_xmm wrt ..gotoff, ebx + rounder_2 wrt ..gotoff >+ DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, ebx + tab_i_35_xmm wrt ..gotoff, ebx + rounder_3 wrt ..gotoff >+ DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, ebx + tab_i_04_xmm wrt ..gotoff, ebx + rounder_4 wrt ..gotoff >+ DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, ebx + tab_i_35_xmm wrt ..gotoff, ebx + rounder_5 wrt ..gotoff >+ DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, ebx + tab_i_26_xmm wrt ..gotoff, ebx + rounder_6 wrt ..gotoff >+ DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, ebx + tab_i_17_xmm wrt ..gotoff, ebx + rounder_7 wrt ..gotoff > > ;; Process the columns (4 at a time) > DCT_8_INV_COL eax+0, eax+0 > DCT_8_INV_COL eax+8, eax+8 > >+ pop ebx > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/dct/x86_asm/idct_sse2_dmitry.asm xvidcore-1.1.0-beta2/src/dct/x86_asm/idct_sse2_dmitry.asm >--- xvidcore-1.1.0-beta2-old/src/dct/x86_asm/idct_sse2_dmitry.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/dct/x86_asm/idct_sse2_dmitry.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -183,7 +183,7 @@ cglobal idct_sse2_dmitry > > ;a 3210 first part > pshufd xmm2, xmm1, 10101010b ;x 64646464 >- pmaddwd xmm2, [%3+16] ;w 15 14 11 10 7632 >+ pmaddwd xmm2, [16+%3] ;w 15 14 11 10 7632 > > ;a 3210 second part > paddd xmm2, xmm0 ;a 3210 ready >@@ -191,11 +191,11 @@ cglobal idct_sse2_dmitry > movdqa xmm5, xmm2 > > pshufd xmm3, xmm1, 01010101b ;x 31313131 >- pmaddwd xmm3, [%3+32] ;w 29 28 25 24 21 20 17 16 >+ pmaddwd xmm3, [32+%3] ;w 29 28 25 24 21 20 17 16 > > ;b 3210 first part > 
pshufd xmm4, xmm1, 11111111b ;x 75757575 >- pmaddwd xmm4, [%3+48] ;w 31 30 27 26 23 22 19 18 >+ pmaddwd xmm4, [48+%3] ;w 31 30 27 26 23 22 19 18 > > ;b 3210 second part > paddd xmm3,xmm4 ;b 3210 ready >@@ -220,7 +220,7 @@ cglobal idct_sse2_dmitry > > movdqa xmm4, [%1+16*2] ;x2 > movdqa xmm5, [%1+16*6] ;x6 >- movdqa xmm6, [tg_2_16] >+ movdqa xmm6, [ebx + tg_2_16 wrt ..gotoff] > movdqa xmm7, xmm6 > > paddsw xmm0, xmm2 ;u04=x0+x4 >@@ -245,12 +245,12 @@ cglobal idct_sse2_dmitry > > movdqa xmm0, [%1+16*1] ;x1 > movdqa xmm1, [%1+16*7] ;x7 >- movdqa xmm2, [tg_1_16] >+ movdqa xmm2, [ebx + tg_1_16 wrt ..gotoff] > movdqa xmm3, xmm2 > > movdqa xmm4, [%1+16*3] ;x3 > movdqa xmm5, [%1+16*5] ;x5 >- movdqa xmm6, [tg_3_16] >+ movdqa xmm6, [ebx + tg_3_16 wrt ..gotoff] > movdqa xmm7, xmm6 > > pmulhw xmm2, xmm0 >@@ -267,7 +267,7 @@ cglobal idct_sse2_dmitry > psubsw xmm6, xmm5 ;v35=x3*T3-x5 > paddsw xmm7, xmm4 ;u35=x5*T3+x3 > >- movdqa xmm4, [ocos_4_16] >+ movdqa xmm4, [ebx + ocos_4_16 wrt ..gotoff] > > paddsw xmm0, xmm7 ;b0=u17+u35 > psubsw xmm1, xmm6 ;b3=v17-v35 >@@ -322,26 +322,37 @@ cglobal idct_sse2_dmitry > movdqa [%2+16*5], xmm7 > %endmacro > >+extern _GLOBAL_OFFSET_TABLE_ >+get_pc.bx: >+ mov ebx, [esp] >+ retn >+ > ;----------------------------------------------------------------------------- > ; void idct_sse2_dmitry(int16_t coeff[64]); > ;----------------------------------------------------------------------------- > > ALIGN 16 > idct_sse2_dmitry: >- >- mov eax, [esp + 4] >- >- DCT_8_INV_ROW_1_SSE2 eax+ 0, eax+ 0, tab_i_04, rounder_2_0 >- DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, tab_i_17, rounder_2_1 >- DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, tab_i_26, rounder_2_2 >- DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, tab_i_35, rounder_2_3 >- DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, tab_i_04, rounder_2_4 >- DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, tab_i_35, rounder_2_5 >- DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, tab_i_26, rounder_2_6 >- DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, tab_i_17, rounder_2_7 >+ push ebx >+ call get_pc.bx >+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ mov eax, [esp + 4 + 4] >+ >+ DCT_8_INV_ROW_1_SSE2 eax+ 0, eax+ 0, ebx + tab_i_04 wrt ..gotoff, ebx + rounder_2_0 wrt ..gotoff >+ DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, ebx + tab_i_17 wrt ..gotoff, ebx + rounder_2_1 wrt ..gotoff >+ DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, ebx + tab_i_26 wrt ..gotoff, ebx + rounder_2_2 wrt ..gotoff >+ DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, ebx + tab_i_35 wrt ..gotoff, ebx + rounder_2_3 wrt ..gotoff >+ DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, ebx + tab_i_04 wrt ..gotoff, ebx + rounder_2_4 wrt ..gotoff >+ DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, ebx + tab_i_35 wrt ..gotoff, ebx + rounder_2_5 wrt ..gotoff >+ DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, ebx + tab_i_26 wrt ..gotoff, ebx + rounder_2_6 wrt ..gotoff >+ DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, ebx + tab_i_17 wrt ..gotoff, ebx + rounder_2_7 wrt ..gotoff > > DCT_8_INV_COL_4_SSE2 eax, eax > >+ pop ebx > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/dct/x86_asm/simple_idct_mmx.asm xvidcore-1.1.0-beta2/src/dct/x86_asm/simple_idct_mmx.asm >--- xvidcore-1.1.0-beta2-old/src/dct/x86_asm/simple_idct_mmx.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/dct/x86_asm/simple_idct_mmx.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -122,7 +122,7 @@ coeffs: > movq mm1,[src4] ; R6 R2 r6 r2 > movq mm2,[src1] ; R3 R1 r3 r1 > movq mm3,[src5] ; R7 R5 r7 r5 >- movq mm4,[wm1010] >+ movq mm4,[ebx + 
>diff -urp xvidcore-1.1.0-beta2-old/src/dct/x86_asm/simple_idct_mmx.asm xvidcore-1.1.0-beta2/src/dct/x86_asm/simple_idct_mmx.asm
>--- xvidcore-1.1.0-beta2-old/src/dct/x86_asm/simple_idct_mmx.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/dct/x86_asm/simple_idct_mmx.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -122,7 +122,7 @@ coeffs:
> movq mm1,[src4] ; R6 R2 r6 r2
> movq mm2,[src1] ; R3 R1 r3 r1
> movq mm3,[src5] ; R7 R5 r7 r5
>- movq mm4,[wm1010]
>+ movq mm4,[ebx + wm1010 wrt ..gotoff]
> pand mm4,mm0
> por mm4,mm1
> por mm4,mm2
>@@ -131,29 +131,29 @@ coeffs:
> movd eax,mm4
> or eax,eax
> jz near .skip1
>- movq mm4,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
>- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm5,[coeffs+32] ; C6 C2 C6 C2
>+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
> pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
>- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
>+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
> pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
>- movq mm7,[coeffs+48] ; C3 C1 C3 C1
>+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
> pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
> rounder_op mm4, rounder_arg
> movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
> paddd mm4,mm5 ; A0 a0
> psubd mm6,mm5 ; A3 a3
>- movq mm5,[coeffs+56] ; C7 C5 C7 C5
>+ movq mm5,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
> pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5
> rounder_op mm0, rounder_arg
> paddd mm1,mm0 ; A1 a1
> paddd mm0,mm0
> psubd mm0,mm1 ; A2 a2
>- pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
>+ pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1
> paddd mm7,mm5 ; B0 b0
>- movq mm5,[coeffs+72] ; -C5 -C1 -C5 -C1
>+ movq mm5,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
> pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5
> paddd mm7,mm4 ; A0+B0 a0+b0
> paddd mm4,mm4 ; 2A0 2a0
>@@ -170,14 +170,14 @@ coeffs:
> packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
> movq [dst],mm7
> movq mm1,[src1] ; R3 R1 r3 r1
>- movq mm4,[coeffs+80] ;-C1 C5 -C1 C5
>+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ;-C1 C5 -C1 C5
> movq [dst + 24],mm2
> pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1
>- movq mm7,[coeffs+88] ; C3 C7 C3 C7
>- pmaddwd mm1,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
>+ movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
>+ pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
> pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
> movq mm2,mm0 ; A2 a2
>- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
>+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
> paddd mm4,mm7 ; B2 b2
> paddd mm2,mm4 ; A2+B2 a2+b2
> psubd mm0,mm4 ; a2-B2 a2-b2
>@@ -196,7 +196,7 @@ coeffs:
> jmp short .skip2
> .skip1
> pslld mm0,16
>- paddd mm0,[d40000]
>+ paddd mm0,[ebx + d40000 wrt ..gotoff]
> psrad mm0,13
> packssdw mm0,mm0
> movq [ dst ],mm0
>@@ -240,29 +240,29 @@ coeffs:
> movd eax,mm4
> or eax,eax
> jz near bt
>- movq mm4,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
>- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm5,[coeffs+32] ; C6 C2 C6 C2
>+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
> pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
>- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
>+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
> pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
>- movq mm7,[coeffs+48] ; C3 C1 C3 C1
>+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
> pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
> rounder_op mm4, rounder_arg
> movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
> paddd mm4,mm5 ; A0 a0
> psubd mm6,mm5 ; A3 a3
>- movq mm5,[coeffs+56] ; C7 C5 C7 C5
>+ movq mm5,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
> pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5
> rounder_op mm0, rounder_arg
> paddd mm1,mm0 ; A1 a1
> paddd mm0,mm0
> psubd mm0,mm1 ; A2 a2
>- pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
>+ pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1
> paddd mm7,mm5 ; B0 b0
>- movq mm5,[coeffs+72] ; -C5 -C1 -C5 -C1
>+ movq mm5,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
> pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5
> paddd mm7,mm4 ; A0+B0 a0+b0
> paddd mm4,mm4 ; 2A0 2a0
>@@ -279,14 +279,14 @@ coeffs:
> packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1
> movq [ dst ],mm7
> movq mm1,[src1] ; R3 R1 r3 r1
>- movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
>+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5
> movq [ dst + 24 ],mm2
> pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1
>- movq mm7,[coeffs+88] ; C3 C7 C3 C7
>- pmaddwd mm1,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
>+ movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
>+ pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
> pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
> movq mm2,mm0 ; A2 a2
>- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
>+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
> paddd mm4,mm7 ; B2 b2
> paddd mm2,mm4 ; A2+B2 a2+b2
> psubd mm0,mm4 ; a2-B2 a2-b2
>@@ -330,17 +330,17 @@ coeffs:
> movq mm1,[src4] ; R6 R2 r6 r2
> movq mm2,[src1] ; R3 R1 r3 r1
> movq mm3,[src5] ; R7 R5 r7 r5
>- movq mm4,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
>- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm5,[coeffs+32] ; C6 C2 C6 C2
>+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
> pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
>- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
>+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
> pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
> ; rounder_op mm4, rounder_arg
> movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
>- movq mm7,[coeffs+48] ; C3 C1 C3 C1
>+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
> ; rounder_op mm0, rounder_arg
> pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
> paddd mm4,mm5 ; A0 a0
>@@ -348,11 +348,11 @@ coeffs:
> movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
> paddd mm0,mm1 ; A1 a1
> psubd mm5,mm1 ; A2 a2
>- movq mm1,[coeffs+56] ; C7 C5 C7 C5
>+ movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
> pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
>- pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
>+ pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1
> paddd mm7,mm1 ; B0 b0
>- movq mm1,[coeffs+72] ; -C5 -C1 -C5 -C1
>+ movq mm1,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
> pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5
> paddd mm7,mm4 ; A0+B0 a0+b0
> paddd mm4,mm4 ; 2A0 2a0
>@@ -374,13 +374,13 @@ coeffs:
> packssdw mm4,mm4 ; A0-B0 a0-b0
> movd [ dst + 112],mm4
> movq mm0,[src1] ; R3 R1 r3 r1
>- movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
>+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5
> pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1
>- movq mm7,[coeffs+88] ; C3 C7 C3 C7
>- pmaddwd mm0,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
>+ movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
>+ pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
> pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
> movq mm2,mm5 ; A2 a2
>- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
>+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
> paddd mm4,mm7 ; B2 b2
> paddd mm2,mm4 ; A2+B2 a2+b2
> psubd mm5,mm4 ; a2-B2 a2-b2
>@@ -426,13 +426,13 @@ coeffs:
> movq mm0,[src0] ; R4 R0 r4 r0
> movq mm1,[src4] ; R6 R2 r6 r2
> movq mm3,[src5] ; R7 R5 r7 r5
>- movq mm4,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
>- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm5,[coeffs+32] ; C6 C2 C6 C2
>+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
> pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
>- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
>+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
> pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
> ; rounder_op mm4, rounder_arg
> movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
>@@ -442,9 +442,9 @@ coeffs:
> movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
> paddd mm0,mm1 ; A1 a1
> psubd mm5,mm1 ; A2 a2
>- movq mm1,[coeffs+56] ; C7 C5 C7 C5
>+ movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
> pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
>- movq mm7,[coeffs+72] ; -C5 -C1 -C5 -C1
>+ movq mm7,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
> pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5
> paddd mm1,mm4 ; A0+B0 a0+b0
> paddd mm4,mm4 ; 2A0 2a0
>@@ -464,10 +464,10 @@ coeffs:
> movd [ dst + 96 ],mm2
> packssdw mm4,mm4 ; A0-B0 a0-b0
> movd [ dst + 112 ],mm4
>- movq mm1,[coeffs+88] ; C3 C7 C3 C7
>+ movq mm1,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
> pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5
> movq mm2,mm5 ; A2 a2
>- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
>+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
> paddd mm2,mm1 ; A2+B2 a2+b2
> psubd mm5,mm1 ; a2-B2 a2-b2
> psrad mm2,shift
>@@ -510,17 +510,17 @@ coeffs:
> %define shift %8
> movq mm0,[src0] ; R4 R0 r4 r0
> movq mm3,[src5] ; R7 R5 r7 r5
>- movq mm4,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
>- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
> ; rounder_op mm4, rounder_arg
> movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
> ; rounder_op mm0, rounder_arg
> movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm1,[coeffs+56] ; C7 C5 C7 C5
>+ movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
> pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
>- movq mm7,[coeffs+72] ; -C5 -C1 -C5 -C1
>+ movq mm7,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
> pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5
> paddd mm1,mm4 ; A0+B0 a0+b0
> paddd mm4,mm4 ; 2A0 2a0
>@@ -540,10 +540,10 @@ coeffs:
> movd [ dst + 96 ],mm2
> packssdw mm4,mm4 ; A0-B0 a0-b0
> movd [ dst + 112 ],mm4
>- movq mm1,[coeffs+88] ; C3 C7 C3 C7
>+ movq mm1,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
> pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5
> movq mm2,mm5 ; A2 a2
>- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
>+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
> paddd mm2,mm1 ; A2+B2 a2+b2
> psubd mm5,mm1 ; a2-B2 a2-b2
> psrad mm2,shift
>@@ -587,21 +587,21 @@ coeffs:
> movq mm0,[src0] ; R4 R0 r4 r0
> movq mm2,[src1] ; R3 R1 r3 r1
> movq mm3,[src5] ; R7 R5 r7 r5
>- movq mm4,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
>- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
> ; rounder_op mm4, rounder_arg
> movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
>- movq mm7,[coeffs+48] ; C3 C1 C3 C1
>+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
> ; rounder_op mm0, rounder_arg
> pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
> movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm1,[coeffs+56] ; C7 C5 C7 C5
>+ movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5
> pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5
>- pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1
>+ pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1
> paddd mm7,mm1 ; B0 b0
>- movq mm1,[coeffs+72] ; -C5 -C1 -C5 -C1
>+ movq mm1,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1
> pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5
> paddd mm7,mm4 ; A0+B0 a0+b0
> paddd mm4,mm4 ; 2A0 2a0
>@@ -623,13 +623,13 @@ coeffs:
> packssdw mm4,mm4 ; A0-B0 a0-b0
> movd [dst + 112],mm4
> movq mm0,[src1] ; R3 R1 r3 r1
>- movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
>+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5
> pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1
>- movq mm7,[coeffs+88] ; C3 C7 C3 C7
>- pmaddwd mm0,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
>+ movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7
>+ pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
> pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5
> movq mm2,mm5 ; A2 a2
>- pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5
>+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5
> paddd mm4,mm7 ; B2 b2
> paddd mm2,mm4 ; A2+B2 a2+b2
> psubd mm5,mm4 ; a2-B2 a2-b2
>@@ -674,17 +674,17 @@ coeffs:
> %define shift %8
> movq mm0,[src0] ; R4 R0 r4 r0
> movq mm2,[src1] ; R3 R1 r3 r1
>- movq mm4,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
>- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
> ; rounder_op mm4, rounder_arg
> movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
>- movq mm7,[coeffs+48] ; C3 C1 C3 C1
>+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
> ; rounder_op mm0, rounder_arg
> pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
> movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm3,[coeffs+64]
>+ movq mm3,[ebx + coeffs+64 wrt ..gotoff]
> pmaddwd mm3,mm2 ; -C7R3+C3R1 -C7r3+C3r1
> paddd mm7,mm4 ; A0+B0 a0+b0
> paddd mm4,mm4 ; 2A0 2a0
>@@ -704,9 +704,9 @@ coeffs:
> movd [dst + 96],mm1
> packssdw mm4,mm4 ; A0-B0 a0-b0
> movd [dst + 112],mm4
>- movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
>+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5
> pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1
>- pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
>+ pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
> movq mm1,mm5 ; A2 a2
> paddd mm1,mm4 ; A2+B2 a2+b2
> psubd mm5,mm4 ; a2-B2 a2-b2
>@@ -750,13 +750,13 @@ coeffs:
> %define shift %8
> movq mm0,[src0] ; R4 R0 r4 r0
> movq mm1,[src4] ; R6 R2 r6 r2
>- movq mm4,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
>- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm5,[coeffs+32] ; C6 C2 C6 C2
>+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
> pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
>- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
>+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
> pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
> ; rounder_op mm4, rounder_arg
> movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
>@@ -768,13 +768,13 @@ coeffs:
> psubd mm5,mm1 ; A2 a2
> movq mm2,[src0 + 8] ; R4 R0 r4 r0
> movq mm3,[src4 + 8] ; R6 R2 r6 r2
>- movq mm1,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm1,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0
>- movq mm7,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm7,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm7,[coeffs+32] ; C6 C2 C6 C2
>+ movq mm7,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
> pmaddwd mm7,mm3 ; C6R6+C2R2 C6r6+C2r2
>- pmaddwd mm3,[coeffs+40] ; -C2R6+C6R2 -C2r6+C6r2
>+ pmaddwd mm3,[ebx + coeffs+40 wrt ..gotoff] ; -C2R6+C6R2 -C2r6+C6r2
> ; rounder_op mm1, rounder_arg
> paddd mm7,mm1 ; A0 a0
> paddd mm1,mm1 ; 2C0 2c0
>@@ -829,17 +829,17 @@ coeffs:
> movq mm0,[src0] ; R4 R0 r4 r0
> movq mm1,[src4] ; R6 R2 r6 r2
> movq mm2,[src1] ; R3 R1 r3 r1
>- movq mm4,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
>- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm5,[coeffs+32] ; C6 C2 C6 C2
>+ movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
> pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2
>- movq mm6,[coeffs+40] ; -C2 C6 -C2 C6
>+ movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6
> pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2
> ; rounder_op mm4, rounder_arg
> movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0
>- movq mm7,[coeffs+48] ; C3 C1 C3 C1
>+ movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1
> ; rounder_op mm0, rounder_arg
> pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1
> paddd mm4,mm5 ; A0 a0
>@@ -847,7 +847,7 @@ coeffs:
> movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0
> paddd mm0,mm1 ; A1 a1
> psubd mm5,mm1 ; A2 a2
>- movq mm1,[coeffs+64]
>+ movq mm1,[ebx + coeffs+64 wrt ..gotoff]
> pmaddwd mm1,mm2 ; -C7R3+C3R1 -C7r3+C3r1
> paddd mm7,mm4 ; A0+B0 a0+b0
> paddd mm4,mm4 ; 2A0 2a0
>@@ -867,9 +867,9 @@ coeffs:
> movd [dst + 96],mm3
> packssdw mm4,mm4 ; A0-B0 a0-b0
> movd [dst + 112],mm4
>- movq mm4,[coeffs+80] ; -C1 C5 -C1 C5
>+ movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5
> pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1
>- pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1
>+ pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1
> movq mm3,mm5 ; A2 a2
> paddd mm3,mm4 ; A2+B2 a2+b2
> psubd mm5,mm4 ; a2-B2 a2-b2
>@@ -912,20 +912,20 @@ coeffs:
> %define rounder_arg %7
> %define shift %8
> movq mm0,[src0] ; R4 R0 r4 r0
>- movq mm4,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0
>- movq mm5,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0
> ; rounder_op mm4, rounder_arg
> ; rounder_op mm0, rounder_arg
> psrad mm4,shift
> psrad mm0,shift
> movq mm2,[src0 + 8] ; R4 R0 r4 r0
>- movq mm1,[coeffs+16] ; C4 C4 C4 C4
>+ movq mm1,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4
> pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0
>- movq mm7,[coeffs+24] ; -C4 C4 -C4 C4
>+ movq mm7,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4
> pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0
>- movq mm7,[coeffs+32] ; C6 C2 C6 C2
>+ movq mm7,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2
> ; rounder_op mm1, rounder_arg
> ; rounder_op mm2, rounder_arg
> psrad mm1,shift
>@@ -1073,6 +1073,11 @@ coeffs:
>
> SECTION .text
>
>+extern _GLOBAL_OFFSET_TABLE_
>+get_pc.bx:
>+ mov ebx, [esp]
>+ retn
>+
> cglobal simple_idct_mmx_P
> cglobal simple_idct_mmx
>
>@@ -1083,14 +1088,18 @@ cglobal simple_idct_mmx
>
> ALIGN 16
> simple_idct_mmx_P:
>+ push ebx
>+ call get_pc.bx
>+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
> sub esp, 128
>- mov edx, [esp+128+4]
>+ mov edx, [esp+128+4+4]
>
> ; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt
>- DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11
>- Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .four
>- Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .two
>- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .one
>+ DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [ebx + coeffs+8 wrt ..gotoff], 11
>+ Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [ebx + coeffs wrt ..gotoff], 11, .four
>+ Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .two
>+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .one
> IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
> IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
> IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
>@@ -1099,8 +1108,8 @@ simple_idct_mmx_P:
>
> ALIGN 16
> .four
>- Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .six
>- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .five
>+ Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .six
>+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .five
> IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
> IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
> IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
>@@ -1109,7 +1118,7 @@ ALIGN 16
>
> ALIGN 16
> .six
>- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .seven
>+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .seven
> IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
> IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
> IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
>@@ -1118,7 +1127,7 @@ ALIGN 16
>
> ALIGN 16
> .two
>- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .three
>+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .three
> IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
> IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
> IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
>@@ -1159,6 +1168,7 @@ ALIGN 16
> .ret
> add esp, 128
>
>+ pop ebx
> ret
> .endfunc
>
>@@ -1174,15 +1184,19 @@ ALIGN 16
>
> ALIGN 16
> simple_idct_mmx:
>+ push ebx
>+ call get_pc.bx
>+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
> sub esp, 128
>- mov edx, [esp+128+4]
>+ mov edx, [esp+128+4+4]
> PERMUTEP edx ; permute parm list in place
>
> ; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt
>- DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11
>- Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .fourP
>- Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .twoP
>- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .oneP
>+ DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [ebx + coeffs+8 wrt ..gotoff], 11
>+ Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [ebx + coeffs wrt ..gotoff], 11, .fourP
>+ Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .twoP
>+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .oneP
> IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
> IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
> IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
>@@ -1191,8 +1205,8 @@ simple_idct_mmx:
>
> ALIGN 16
> .fourP
>- Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .sixP
>- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .fiveP
>+ Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .sixP
>+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .fiveP
> IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
> IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
> IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
>@@ -1201,7 +1215,7 @@ ALIGN 16
>
> ALIGN 16
> .sixP
>- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .sevenP
>+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .sevenP
> IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
> IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
> IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
>@@ -1210,7 +1224,7 @@ ALIGN 16
>
> ALIGN 16
> .twoP
>- Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .threeP
>+ Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .threeP
> IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20
> IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20
> IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20
>@@ -1251,6 +1265,9 @@ ALIGN 16
> .retP
> add esp, 128
>
>+ pop ebx
> ret
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
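The colorspace helpers that follow need ebp for the GOT pointer, so the next hunks also evict the two loop counters that used to live in ebp: height and fixed_width stay in memory and are decremented there, which lets the tmp_height spill slot disappear entirely. The trade in isolation (height stands for the %define'd stack slot from the include file):

    .y_loop
        ; ... process one band of rows ...
        sub dword [height], VPIXELS ; count down directly in the stack slot;
        jg .y_loop                  ; the memory 'sub' sets the flags for the branch

A memory decrement per iteration is marginally slower than a register one, but it frees a whole general-purpose register in code that has none to spare.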
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/colorspace_mmx.inc xvidcore-1.1.0-beta2/src/image/x86_asm/colorspace_mmx.inc
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/colorspace_mmx.inc 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/colorspace_mmx.inc 2005-10-23 18:42:40.000000000 +0200
>@@ -56,11 +56,13 @@ NAME:
> push edi ; esp + localsize + 4
> push ebp ; esp + localsize + 0
>
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
> %define x_dif esp + localsize - 4
> %define y_dif esp + localsize - 8
> %define uv_dif esp + localsize - 12
> %define fixed_width esp + localsize - 16
>-%define tmp_height esp + localsize - 20
>
> sub esp, localsize
>
>@@ -90,8 +92,6 @@ NAME:
> mov esi, [y_ptr] ; $esi$ = y_ptr
> mov edi, [x_ptr] ; $edi$ = x_ptr
> mov edx, [x_stride] ; $edx$ = x_stride
>- mov ebp, [height] ; $ebp$ = height
>-
>
> mov ebx, [vflip]
> or ebx, ebx
>@@ -106,7 +106,7 @@ NAME:
> sub ebx, edx
> mov [x_dif], ebx ; x_dif = -BYTES*fixed_width - x_stride
>
>- mov eax, ebp
>+ mov eax, [height]
> sub eax, 1
> push edx
> mul edx
>@@ -126,8 +126,6 @@ NAME:
> FUNC %+ _INIT ARG1, ARG2 ; call FUNC_INIT
>
> .y_loop
>- mov [tmp_height], ebp
>- mov ebp, [fixed_width]
>
> .x_loop
> FUNC ARG1, ARG2 ; call FUNC
>@@ -137,10 +135,9 @@ NAME:
> add ebx, PIXELS/2 ; u_ptr += PIXELS/2
> add ecx, PIXELS/2 ; v_ptr += PIXELS/2
>
>- sub ebp, PIXELS ; $ebp$ -= PIXELS
>+ sub dword [fixed_width], PIXELS ; $ebp$ -= PIXELS
> jg .x_loop ; if ($ebp$ > 0) goto .x_loop
>
>- mov ebp, [tmp_height]
> add edi, [x_dif] ; x_ptr += x_dif + (VPIXELS-1)*x_stride
> add esi, [y_dif] ; y_ptr += y_dif + (VPIXELS-1)*y_stride
> %rep VPIXELS-1
>@@ -155,7 +152,7 @@ NAME:
> add ecx, [uv_stride]
> %endrep
>
>- sub ebp, VPIXELS ; $ebp$ -= VPIXELS
>+ sub dword [height], VPIXELS ; $ebp$ -= VPIXELS
> jg .y_loop ; if ($ebp$ > 0) goto .y_loop
>
> ; cleanup stack & undef everything
>@@ -181,7 +178,6 @@ NAME:
> %undef y_dif
> %undef uv_dif
> %undef fixed_width
>-%undef tmp_height
> ret
> .endfunc
> %undef NAME
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/colorspace_rgb_mmx.asm xvidcore-1.1.0-beta2/src/image/x86_asm/colorspace_rgb_mmx.asm
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/colorspace_rgb_mmx.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/colorspace_rgb_mmx.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -120,7 +120,7 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
> ;------------------------------------------------------------------------------
>
> %macro BGR_TO_YV12_INIT 2
>- movq mm7, [y_mul]
>+ movq mm7, [ebp + y_mul wrt ..gotoff]
> %endmacro
>
>
>@@ -184,8 +184,8 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
>
> ; u_ptr, v_ptr
> movq mm0, mm6 ; = [ |b4|g4|r4]
>- pmaddwd mm6, [v_mul] ; *= V_MUL
>- pmaddwd mm0, [u_mul] ; *= U_MUL
>+ pmaddwd mm6, [ebp + v_mul wrt ..gotoff] ; *= V_MUL
>+ pmaddwd mm0, [ebp + u_mul wrt ..gotoff] ; *= U_MUL
> movq mm1, mm0
> movq mm2, mm6
> psrlq mm1, 32
>@@ -230,30 +230,30 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
> movd mm3, [ecx] ; v_ptr[0]
> punpcklbw mm2, mm7 ; u3u2u1u0 -> mm2
> punpcklbw mm3, mm7 ; v3v2v1v0 -> mm3
>- psubsw mm2, [U_SUB] ; U - 128
>- psubsw mm3, [V_SUB] ; V - 128
>+ psubsw mm2, [ebp + U_SUB wrt ..gotoff] ; U - 128
>+ psubsw mm3, [ebp + V_SUB wrt ..gotoff] ; V - 128
> movq mm4, mm2
> movq mm5, mm3
>- pmullw mm2, [UG_MUL]
>- pmullw mm3, [VG_MUL]
>+ pmullw mm2, [ebp + UG_MUL wrt ..gotoff]
>+ pmullw mm3, [ebp + VG_MUL wrt ..gotoff]
> movq mm6, mm2 ; u3u2u1u0 -> mm6
> punpckhwd mm2, mm2 ; u3u3u2u2 -> mm2
> punpcklwd mm6, mm6 ; u1u1u0u0 -> mm6
>- pmullw mm4, [UB_MUL] ; B_ADD -> mm4
>+ pmullw mm4, [ebp + UB_MUL wrt ..gotoff] ; B_ADD -> mm4
> movq mm0, mm3
> punpckhwd mm3, mm3 ; v3v3v2v2 -> mm2
> punpcklwd mm0, mm0 ; v1v1v0v0 -> mm6
> paddsw mm2, mm3
> paddsw mm6, mm0
>- pmullw mm5, [VR_MUL] ; R_ADD -> mm5
>+ pmullw mm5, [ebp + VR_MUL wrt ..gotoff] ; R_ADD -> mm5
> movq mm0, [esi] ; y7y6y5y4y3y2y1y0 -> mm0
> movq mm1, mm0
> punpckhbw mm1, mm7 ; y7y6y5y4 -> mm1
> punpcklbw mm0, mm7 ; y3y2y1y0 -> mm0
>- psubsw mm0, [Y_SUB] ; Y - Y_SUB
>- psubsw mm1, [Y_SUB] ; Y - Y_SUB
>- pmullw mm1, [Y_MUL]
>- pmullw mm0, [Y_MUL]
>+ psubsw mm0, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB
>+ psubsw mm1, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB
>+ pmullw mm1, [ebp + Y_MUL wrt ..gotoff]
>+ pmullw mm0, [ebp + Y_MUL wrt ..gotoff]
> movq [TEMP_Y2], mm1 ; y7y6y5y4 -> mm3
> movq [TEMP_Y1], mm0 ; y3y2y1y0 -> mm7
> psubsw mm1, mm2 ; g7g6g5g4 -> mm1
>@@ -266,10 +266,10 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
> movq mm1, mm0
> punpckhbw mm1, mm7 ; y7y6y5y4 -> mm1
> punpcklbw mm0, mm7 ; y3y2y1y0 -> mm0
>- psubsw mm0, [Y_SUB] ; Y - Y_SUB
>- psubsw mm1, [Y_SUB] ; Y - Y_SUB
>- pmullw mm1, [Y_MUL]
>- pmullw mm0, [Y_MUL]
>+ psubsw mm0, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB
>+ psubsw mm1, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB
>+ pmullw mm1, [ebp + Y_MUL wrt ..gotoff]
>+ pmullw mm0, [ebp + Y_MUL wrt ..gotoff]
> movq mm3, mm1
> psubsw mm1, mm2 ; g7g6g5g4 -> mm1
> movq mm2, mm0
>@@ -419,6 +419,11 @@ BRIGHT: db 128, 128, 128, 128, 128, 128,
>
> SECTION .text
>
>+extern _GLOBAL_OFFSET_TABLE_
>+get_pc.bp:
>+ mov ebp, [esp]
>+ retn
>+
> %include "colorspace_mmx.inc"
>
> ; input
>@@ -429,3 +434,5 @@ MAKE_COLORSPACE bgra_to_yv12_mmx,0, 4
> MAKE_COLORSPACE yv12_to_bgr_mmx,48, 3,8,2, YV12_TO_BGR, 3, -1
> MAKE_COLORSPACE yv12_to_bgra_mmx,48, 4,8,2, YV12_TO_BGR, 4, -1
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
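Alongside the PIC rework, every .asm file gains a trailing .note.GNU-stack section, which is the fix for the second half of this bug: GNU ld assumes an object may need an executable stack unless the object says otherwise, and a single unmarked object is enough to force PT_GNU_STACK to RWX for the whole libxvidcore.so. The marker costs nothing at run time; a complete minimal translation unit carrying it (illustrative names):

    BITS 32
    SECTION .text

    global dummy
    dummy:              ; trivial function; the point is the note below
        ret

    ; zero-size note section: tells the linker this object is
    ; safe with a non-executable stack
    section .note.GNU-stack noalloc noexec nowrite progbits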
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/colorspace_yuv_mmx.asm xvidcore-1.1.0-beta2/src/image/x86_asm/colorspace_yuv_mmx.asm
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/colorspace_yuv_mmx.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/colorspace_yuv_mmx.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -279,3 +279,6 @@ SECTION .text
> MAKE_YV12_TO_YV12 yv12_to_yv12_mmx, 0
>
> MAKE_YV12_TO_YV12 yv12_to_yv12_xmm, 1
>+
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/colorspace_yuyv_mmx.asm xvidcore-1.1.0-beta2/src/image/x86_asm/colorspace_yuyv_mmx.asm
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/colorspace_yuyv_mmx.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/colorspace_yuyv_mmx.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -76,7 +76,7 @@ mmx_one: dw 1, 1, 1, 1
> ;-----------------------------------------------------------------------------
>
> %macro YUYV_TO_YV12_INIT 2
>- movq mm7, [yuyv_mask]
>+ movq mm7, [ebp + yuyv_mask wrt ..gotoff]
> %endmacro
>
>
>@@ -108,8 +108,8 @@ mmx_one: dw 1, 1, 1, 1
> pand mm5, mm7
> pand mm6, mm7
> paddw mm5, mm6
>- paddw mm4, [mmx_one] ; +1 rounding
>- paddw mm5, [mmx_one] ;
>+ paddw mm4, [ebp + mmx_one wrt ..gotoff] ; +1 rounding
>+ paddw mm5, [ebp + mmx_one wrt ..gotoff] ;
> psrlw mm4, 1
> psrlw mm5, 1
> ;---[ 3dnow/xmm ]----------------------------------------------------
>@@ -310,6 +310,11 @@ mmx_one: dw 1, 1, 1, 1
>
> SECTION .text
>
>+extern _GLOBAL_OFFSET_TABLE_
>+get_pc.bp:
>+ mov ebp, [esp]
>+ retn
>+
> %include "colorspace_mmx.inc"
>
> ; input
>@@ -329,3 +334,6 @@ MAKE_COLORSPACE yv12_to_uyvy_mmx,0,
>
> MAKE_COLORSPACE yv12_to_yuyvi_mmx,0, 2,8,4, YV12_TO_YUYVI, 0, -1
> MAKE_COLORSPACE yv12_to_uyvyi_mmx,0, 2,8,4, YV12_TO_YUYVI, 1, -1
>+
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/interpolate8x8_3dn.asm xvidcore-1.1.0-beta2/src/image/x86_asm/interpolate8x8_3dn.asm
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/interpolate8x8_3dn.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/interpolate8x8_3dn.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -44,20 +44,6 @@ BITS 32
> %endmacro
>
> ;=============================================================================
>-; Read Only data
>-;=============================================================================
>-
>-%ifdef FORMAT_COFF
>-SECTION .rodata
>-%else
>-SECTION .rodata align=16
>-%endif
>-
>-ALIGN 16
>-mmx_one:
>- times 8 db 1
>-
>-;=============================================================================
> ; Code
> ;=============================================================================
>
>@@ -128,7 +114,10 @@ interpolate8x8_halfpel_h_3dn:
>
> .rounding1
> ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
>- movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
>+ movq mm7, [esp]
>+ add esp, byte 8
> COPY_H_3DN_RND1
> lea ecx, [ecx+2*edx]
> COPY_H_3DN_RND1
>@@ -202,7 +191,10 @@ interpolate8x8_halfpel_v_3dn:
>
> .rounding1
> ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
>- movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
>+ movq mm7, [esp]
>+ add esp, byte 8
> movq mm2, [eax] ; loop invariant
> add eax, edx
>
>@@ -325,7 +317,10 @@ interpolate8x8_halfpel_hv_3dn
> mov eax, [esp+ 8] ; Src
> mov edx, [esp+12] ; stride
>
>- movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
>+ movq mm7, [esp]
>+ add esp, byte 8
>
> ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
> movq mm2, [eax]
>@@ -356,3 +351,5 @@ interpolate8x8_halfpel_hv_3dn
> ret
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/interpolate8x8_3dne.asm xvidcore-1.1.0-beta2/src/image/x86_asm/interpolate8x8_3dne.asm
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/interpolate8x8_3dne.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/interpolate8x8_3dne.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -45,24 +45,6 @@ BITS 32
> %endmacro
>
> ;=============================================================================
>-; Read only data
>-;=============================================================================
>-
>-%ifdef FORMAT_COFF
>-SECTION .rodata
>-%else
>-SECTION .rodata align=16
>-%endif
>-
>-ALIGN 16
>-mmx_one:
>- times 8 db 1
>-
>-ALIGN 8
>-mm_minusone:
>- dd -1,-1
>-
>-;=============================================================================
> ; Macros
> ;=============================================================================
>
>@@ -145,7 +127,10 @@ interpolate8x8_halfpel_h_3dne:
> .rounding1
> ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
> mov ecx, [esp+ 4] ; Dst
>- movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
>+ movq mm7, [esp]
>+ add esp, byte 8
> COPY_H_SSE_RND1
> lea ecx, [ecx+2*edx]
> COPY_H_SSE_RND1
>@@ -219,15 +204,15 @@ ALIGN 8
> psubusb mm0, [eax]
> add eax, edx
> mov ecx, [esp+ 4] ; Dst
>- push esi
>+ push byte -1
>+ push byte -1
> pcmpeqb mm1, mm1
> pcmpeqb mm2, mm2
>- mov esi, mm_minusone
> psubusb mm1, [byte eax]
> psubusb mm2, [eax+edx]
> lea eax, [eax+2*edx]
>- movq mm6, [esi]
>- movq mm7, [esi]
>+ movq mm6, [esp]
>+ movq mm7, [esp]
> pavgb mm0, mm1
> pavgb mm1, mm2
> psubusb mm6, mm0
>@@ -242,8 +227,8 @@ ALIGN 8
> lea eax, [eax+2*edx]
> pavgb mm2, mm3
> pavgb mm3, mm4
>- movq mm0, [esi]
>- movq mm1, [esi]
>+ movq mm0, [esp]
>+ movq mm1, [esp]
> psubusb mm0, mm2
> psubusb mm1, mm3
> movq [ecx], mm0
>@@ -257,8 +242,8 @@ ALIGN 8
> lea eax, [eax+2*edx]
> pavgb mm4, mm5
> pavgb mm5, mm6
>- movq mm2, [esi]
>- movq mm3, [esi]
>+ movq mm2, [esp]
>+ movq mm3, [esp]
> psubusb mm2, mm4
> psubusb mm3, mm5
> movq [ecx], mm2
>@@ -270,10 +255,10 @@ ALIGN 8
> psubusb mm0, [eax+edx]
> pavgb mm6, mm7
> pavgb mm7, mm0
>- movq mm4, [esi]
>- movq mm5, [esi]
>+ movq mm4, [esp]
>+ movq mm5, [esp]
> psubusb mm4, mm6
>- pop esi
>+ add esp, byte 8
> psubusb mm5, mm7
> movq [ecx], mm4
> movq [ecx+edx], mm5
>@@ -387,7 +372,10 @@ interpolate8x8_halfpel_hv_3dne:
> pavgb mm2, mm3
> pxor mm3, mm6 ; mm2/mm3 ready
> mov ecx, [esp+ 4] ; Dst
>- movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
>+ movq mm7, [esp]
>+ add esp, byte 8
>
> jz near .rounding1
> lea ebp,[byte ebp]
>@@ -412,3 +400,5 @@ ALIGN 16
> ret
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
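For one-off 8-byte constants the patch skips the GOT dance entirely and rebuilds the value on the stack from immediates, as in the .rounding1 paths above, where the old mmx_one vector (eight 0x01 bytes) is assembled, loaded and discarded in four instructions:

    push dword 0x01010101   ; high half of the 01 01 01 01 01 01 01 01 qword
    push dword 0x01010101   ; low half
    movq mm7, [esp]         ; mm7 = mmx_one, with no data reference at all
    add esp, byte 8         ; drop the temporary

The identity in the comment these hunks keep is easy to verify: pavgb computes the rounded average (i+j+1)>>1, and subtracting (i^j)&1 converts it to the truncated (i+j)>>1 -- e.g. i=3, j=4 gives (3+4+1)>>1 = 4 and (3^4)&1 = 1, hence 4-1 = 3 = (3+4)>>1.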
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/interpolate8x8_mmx.asm xvidcore-1.1.0-beta2/src/image/x86_asm/interpolate8x8_mmx.asm
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/interpolate8x8_mmx.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/interpolate8x8_mmx.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -162,13 +162,17 @@ interpolate8x8_halfpel_h_mmx:
>
> push esi
> push edi
>- mov eax, [esp + 8 + 16] ; rounding
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>
>- movq mm7, [rounding1_mmx + eax * 8]
>+ mov eax, [esp + 12 + 16] ; rounding
>
>- mov edi, [esp + 8 + 4] ; dst
>- mov esi, [esp + 8 + 8] ; src
>- mov edx, [esp + 8 + 12] ; stride
>+ movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
>+
>+ mov edi, [esp + 12 + 4] ; dst
>+ mov esi, [esp + 12 + 8] ; src
>+ mov edx, [esp + 12 + 12] ; stride
>
> pxor mm6, mm6 ; zero
>
>@@ -181,6 +185,7 @@ interpolate8x8_halfpel_h_mmx:
> COPY_H_MMX
> COPY_H_MMX
>
>+ pop ebp
> pop edi
> pop esi
>
>@@ -221,13 +226,17 @@ interpolate8x8_halfpel_v_mmx:
> push esi
> push edi
>
>- mov eax, [esp + 8 + 16] ; rounding
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ mov eax, [esp + 12 + 16] ; rounding
>
>- movq mm7, [rounding1_mmx + eax * 8]
>+ movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff]
>
>- mov edi, [esp + 8 + 4] ; dst
>- mov esi, [esp + 8 + 8] ; src
>- mov edx, [esp + 8 + 12] ; stride
>+ mov edi, [esp + 12 + 4] ; dst
>+ mov esi, [esp + 12 + 8] ; src
>+ mov edx, [esp + 12 + 12] ; stride
>
> pxor mm6, mm6 ; zero
>
>@@ -241,6 +250,7 @@ interpolate8x8_halfpel_v_mmx:
> COPY_V_MMX
> COPY_V_MMX
>
>+ pop ebp
> pop edi
> pop esi
>
>@@ -311,18 +321,22 @@ interpolate8x8_halfpel_hv_mmx:
> push esi
> push edi
>
>- mov eax, [esp + 8 + 16] ; rounding
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>
>- movq mm7, [rounding2_mmx + eax * 8]
>+ mov eax, [esp + 12 + 16] ; rounding
>
>- mov edi, [esp + 8 + 4] ; dst
>- mov esi, [esp + 8 + 8] ; src
>+ movq mm7, [ebp + rounding2_mmx + eax * 8 wrt ..gotoff]
>+
>+ mov edi, [esp + 12 + 4] ; dst
>+ mov esi, [esp + 12 + 8] ; src
>
> mov eax, 8
>
> pxor mm6, mm6 ; zero
>
>- mov edx, [esp + 8 + 12] ; stride
>+ mov edx, [esp + 12 + 12] ; stride
>
> COPY_HV_MMX
> COPY_HV_MMX
>@@ -333,6 +347,7 @@ interpolate8x8_halfpel_hv_mmx:
> COPY_HV_MMX
> COPY_HV_MMX
>
>+ pop ebp
> pop edi
> pop esi
>
>@@ -373,10 +388,10 @@ interpolate8x8_halfpel_hv_mmx:
>
> por mm3, mm6
>
>- pand mm0, [mmx_mask]
>- pand mm1, [mmx_mask]
>- pand mm4, [mmx_mask]
>- pand mm5, [mmx_mask]
>+ pand mm0, [ebp + mmx_mask wrt ..gotoff]
>+ pand mm1, [ebp + mmx_mask wrt ..gotoff]
>+ pand mm4, [ebp + mmx_mask wrt ..gotoff]
>+ pand mm5, [ebp + mmx_mask wrt ..gotoff]
>
> psrlq mm0, 1 ; src1 / 2
> psrlq mm1, 1 ; src2 / 2
>@@ -420,10 +435,10 @@ interpolate8x8_halfpel_hv_mmx:
>
> pand mm3, mm6
>
>- pand mm0, [mmx_mask]
>- pand mm1, [mmx_mask]
>- pand mm4, [mmx_mask]
>- pand mm5, [mmx_mask]
>+ pand mm0, [ebp + mmx_mask wrt ..gotoff]
>+ pand mm1, [ebp + mmx_mask wrt ..gotoff]
>+ pand mm4, [ebp + mmx_mask wrt ..gotoff]
>+ pand mm5, [ebp + mmx_mask wrt ..gotoff]
>
> psrlq mm0, 1 ; src1 / 2
> psrlq mm1, 1 ; src2 / 2
>@@ -449,21 +464,25 @@ interpolate8x8_avg2_mmx:
>
> push ebx
>
>- mov eax, [esp + 4 + 20] ; rounding
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ mov eax, [esp + 8 + 20] ; rounding
> test eax, eax
>
> jnz near .rounding1
>
>- mov eax, [esp + 4 + 24] ; height -> eax
>+ mov eax, [esp + 8 + 24] ; height -> eax
> sub eax, 8
> test eax, eax
>
>- mov ecx, [esp + 4 + 4] ; dst -> edi
>- mov eax, [esp + 4 + 8] ; src1 -> esi
>- mov ebx, [esp + 4 + 12] ; src2 -> eax
>- mov edx, [esp + 4 + 16] ; stride -> edx
>+ mov ecx, [esp + 8 + 4] ; dst -> edi
>+ mov eax, [esp + 8 + 8] ; src1 -> esi
>+ mov ebx, [esp + 8 + 12] ; src2 -> eax
>+ mov edx, [esp + 8 + 16] ; stride -> edx
>
>- movq mm7, [mmx_one]
>+ movq mm7, [ebp + mmx_one wrt ..gotoff]
>
> jz near .start0
>
>@@ -484,16 +503,16 @@ interpolate8x8_avg2_mmx:
> ret
>
> .rounding1
>- mov eax, [esp + 4 + 24] ; height -> eax
>+ mov eax, [esp + 8 + 24] ; height -> eax
> sub eax, 8
> test eax, eax
>
>- mov ecx, [esp + 4 + 4] ; dst -> edi
>- mov eax, [esp + 4 + 8] ; src1 -> esi
>- mov ebx, [esp + 4 + 12] ; src2 -> eax
>- mov edx, [esp + 4 + 16] ; stride -> edx
>+ mov ecx, [esp + 8 + 4] ; dst -> edi
>+ mov eax, [esp + 8 + 8] ; src1 -> esi
>+ mov ebx, [esp + 8 + 12] ; src2 -> eax
>+ mov edx, [esp + 8 + 16] ; stride -> edx
>
>- movq mm7, [mmx_one]
>+ movq mm7, [ebp + mmx_one wrt ..gotoff]
>
> jz near .start1
>
>@@ -510,6 +529,7 @@ interpolate8x8_avg2_mmx:
> lea ecx, [ecx+2*edx]
> AVG2_MMX_RND1
>
>+ pop ebp
> pop ebx
> ret
> .endfunc
>@@ -534,11 +554,11 @@ interpolate8x8_avg2_mmx:
> movq mm2, mm0
> movq mm3, mm1
>
>- pand mm2, [mmx_three]
>- pand mm3, [mmx_three]
>+ pand mm2, [ebp + mmx_three wrt ..gotoff]
>+ pand mm3, [ebp + mmx_three wrt ..gotoff]
>
>- pand mm0, [mmx_mask2]
>- pand mm1, [mmx_mask2]
>+ pand mm0, [ebp + mmx_mask2 wrt ..gotoff]
>+ pand mm1, [ebp + mmx_mask2 wrt ..gotoff]
>
> psrlq mm0, 2
> psrlq mm1, 2
>@@ -555,11 +575,11 @@ interpolate8x8_avg2_mmx:
> movq mm1, mm4
> movq mm3, mm5
>
>- pand mm1, [mmx_three]
>- pand mm3, [mmx_three]
>+ pand mm1, [ebp + mmx_three wrt ..gotoff]
>+ pand mm3, [ebp + mmx_three wrt ..gotoff]
>
>- pand mm4, [mmx_mask2]
>- pand mm5, [mmx_mask2]
>+ pand mm4, [ebp + mmx_mask2 wrt ..gotoff]
>+ pand mm5, [ebp + mmx_mask2 wrt ..gotoff]
>
> psrlq mm4, 2
> psrlq mm5, 2
>@@ -570,8 +590,8 @@ interpolate8x8_avg2_mmx:
> paddb mm1, mm3
> paddb mm2, mm1
>
>- paddb mm2, [mmx_two]
>- pand mm2, [mmx_mask2]
>+ paddb mm2, [ebp + mmx_two wrt ..gotoff]
>+ pand mm2, [ebp + mmx_mask2 wrt ..gotoff]
>
> psrlq mm2, 2
> paddb mm0, mm2
>@@ -589,11 +609,11 @@ interpolate8x8_avg2_mmx:
> movq mm2, mm0
> movq mm3, mm1
>
>- pand mm2, [mmx_three]
>- pand mm3, [mmx_three]
>+ pand mm2, [ebp + mmx_three wrt ..gotoff]
>+ pand mm3, [ebp + mmx_three wrt ..gotoff]
>
>- pand mm0, [mmx_mask2]
>- pand mm1, [mmx_mask2]
>+ pand mm0, [ebp + mmx_mask2 wrt ..gotoff]
>+ pand mm1, [ebp + mmx_mask2 wrt ..gotoff]
>
> psrlq mm0, 2
> psrlq mm1, 2
>@@ -610,11 +630,11 @@ interpolate8x8_avg2_mmx:
> movq mm1, mm4
> movq mm3, mm5
>
>- pand mm1, [mmx_three]
>- pand mm3, [mmx_three]
>+ pand mm1, [ebp + mmx_three wrt ..gotoff]
>+ pand mm3, [ebp + mmx_three wrt ..gotoff]
>
>- pand mm4, [mmx_mask2]
>- pand mm5, [mmx_mask2]
>+ pand mm4, [ebp + mmx_mask2 wrt ..gotoff]
>+ pand mm5, [ebp + mmx_mask2 wrt ..gotoff]
>
> psrlq mm4, 2
> psrlq mm5, 2
>@@ -625,8 +645,8 @@ interpolate8x8_avg2_mmx:
> paddb mm1, mm3
> paddb mm2, mm1
>
>- paddb mm2, [mmx_one]
>- pand mm2, [mmx_mask2]
>+ paddb mm2, [ebp + mmx_one wrt ..gotoff]
>+ pand mm2, [ebp + mmx_mask2 wrt ..gotoff]
>
> psrlq mm2, 2
> paddb mm0, mm2
>@@ -644,18 +664,22 @@ interpolate8x8_avg4_mmx:
> push edi
> push esi
>
>- mov eax, [esp + 12 + 28] ; rounding
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ mov eax, [esp + 16 + 28] ; rounding
>
> test eax, eax
>
>- mov ecx, [esp + 12 + 4] ; dst -> edi
>- mov eax, [esp + 12 + 8] ; src1 -> esi
>- mov ebx, [esp + 12 + 12] ; src2 -> eax
>- mov esi, [esp + 12 + 16] ; src3 -> esi
>- mov edi, [esp + 12 + 20] ; src4 -> edi
>- mov edx, [esp + 12 + 24] ; stride -> edx
>+ mov ecx, [esp + 16 + 4] ; dst -> edi
>+ mov eax, [esp + 16 + 8] ; src1 -> esi
>+ mov ebx, [esp + 16 + 12] ; src2 -> eax
>+ mov esi, [esp + 16 + 16] ; src3 -> esi
>+ mov edi, [esp + 16 + 20] ; src4 -> edi
>+ mov edx, [esp + 16 + 24] ; stride -> edx
>
>- movq mm7, [mmx_one]
>+ movq mm7, [ebp + mmx_one wrt ..gotoff]
>
> jnz near .rounding1
>
>@@ -697,6 +721,7 @@ interpolate8x8_avg4_mmx:
> lea ecx, [ecx+edx]
> AVG4_MMX_RND1
>
>+ pop ebp
> pop esi
> pop edi
> pop ebx
>@@ -750,8 +775,8 @@ interpolate8x8_avg4_mmx:
> psubsw mm0, mm2
> psubsw mm1, mm3
>
>- pmullw mm0, [mmx_five]
>- pmullw mm1, [mmx_five]
>+ pmullw mm0, [ebp + mmx_five wrt ..gotoff]
>+ pmullw mm1, [ebp + mmx_five wrt ..gotoff]
>
> movq mm2, [eax-2]
> movq mm4, [eax+3]
>@@ -785,13 +810,13 @@ interpolate8x8_avg4_mmx:
> ALIGN 16
> interpolate8x8_6tap_lowpass_h_mmx:
>
>- mov eax, [esp + 16] ; rounding
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ mov eax, [esp + 20] ; rounding
>
>- movq mm6, [rounding_lowpass_mmx + eax * 8]
>+ movq mm6, [ebp + rounding_lowpass_mmx + eax * 8 wrt ..gotoff]
>
>- mov ecx, [esp + 4] ; dst -> edi
>- mov eax, [esp + 8] ; src -> esi
>- mov edx, [esp + 12] ; stride -> edx
>+ mov ecx, [esp + 8] ; dst -> edi
>+ mov eax, [esp + 12] ; src -> esi
>+ mov edx, [esp + 16] ; stride -> edx
>
> pxor mm7, mm7
>
>@@ -811,6 +840,7 @@ interpolate8x8_6tap_lowpass_h_mmx:
> lea ecx, [ecx+edx]
> LOWPASS_6TAP_H_MMX
>
>+ pop ebp
> ret
> .endfunc
>
>@@ -861,8 +891,8 @@ interpolate8x8_6tap_lowpass_h_mmx:
> psubsw mm0, mm2
> psubsw mm1, mm3
>
>- pmullw mm0, [mmx_five]
>- pmullw mm1, [mmx_five]
>+ pmullw mm0, [ebp + mmx_five wrt ..gotoff]
>+ pmullw mm1, [ebp + mmx_five wrt ..gotoff]
>
> movq mm2, [eax+edx]
> movq mm4, [eax+2*ebx]
>@@ -898,13 +928,13 @@ interpolate8x8_6tap_lowpass_v_mmx:
>
> push ebx
>
>- mov eax, [esp + 4 + 16] ; rounding
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ mov eax, [esp + 8 + 16] ; rounding
>
>- movq mm6, [rounding_lowpass_mmx + eax * 8]
>+ movq mm6, [ebp + rounding_lowpass_mmx + eax * 8 wrt ..gotoff]
>
>- mov ecx, [esp + 4 + 4] ; dst -> edi
>- mov eax, [esp + 4 + 8] ; src -> esi
>- mov edx, [esp + 4 + 12] ; stride -> edx
>+ mov ecx, [esp + 8 + 4] ; dst -> edi
>+ mov eax, [esp + 8 + 8] ; src -> esi
>+ mov edx, [esp + 8 + 12] ; stride -> edx
>
> mov ebx, edx
> shl ebx, 1
>@@ -928,6 +962,7 @@ interpolate8x8_6tap_lowpass_v_mmx:
> lea ecx, [ecx+edx]
> LOWPASS_6TAP_V_MMX
>
>+ pop ebp
> pop ebx
> ret
> .endfunc
>@@ -948,12 +983,17 @@ interpolate8x8_6tap_lowpass_v_mmx:
>
> %macro PROLOG 2 ; %1: Rounder, %2 load Dst-Rounder
> pxor mm6, mm6
>- movq mm7, [%1] ; TODO: dangerous! (eax isn't checked)
>+ PROLOG0
>+
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
> %if %2
>- movq mm5, [rounding1_mmx]
>+ movq mm5, [ebp + rounding1_mmx wrt ..gotoff]
> %endif
>
>- PROLOG0
>+ movq mm7, [ebp + %1 wrt ..gotoff] ; TODO: dangerous! (eax isn't checked)
> %endmacro
>
> ; performs: mm0 == (mm0+mm2) mm1 == (mm1+mm3)
>@@ -1042,6 +1082,7 @@ interpolate8x8_halfpel_add_mmx:
> ADD_FF_MMX 1
> ADD_FF_MMX 1
> ADD_FF_MMX 0
>+ pop ebp
> ret
> .endfunc
>
>@@ -1088,6 +1129,7 @@ interpolate8x8_halfpel_h_add_mmx:
> ADD_FH_MMX
> lea ecx,[ecx+edx]
> ADD_FH_MMX
>+ pop ebp
> ret
> .endfunc
>
>@@ -1135,6 +1177,7 @@ interpolate8x8_halfpel_v_add_mmx:
> ADD_HF_MMX
> lea ecx,[ecx+edx]
> ADD_HF_MMX
>+ pop ebp
> ret
> .endfunc
>
>@@ -1200,8 +1243,8 @@ interpolate8x8_halfpel_v_add_mmx:
> paddusw mm0, mm4 ; mix Src(mm0/mm1) with Dst(mm2/mm3)
> paddusw mm1, mm5
>
>- paddusw mm0, [rounding1_mmx]
>- paddusw mm1, [rounding1_mmx]
>+ paddusw mm0, [ebp + rounding1_mmx wrt ..gotoff]
>+ paddusw mm1, [ebp + rounding1_mmx wrt ..gotoff]
>
> psrlw mm0, 1
> psrlw mm1, 1
>@@ -1211,6 +1254,11 @@ interpolate8x8_halfpel_v_add_mmx:
> movq [ecx], mm0
> %endmacro
>
>+extern _GLOBAL_OFFSET_TABLE_
>+get_pc.bp:
>+ mov ebp, [esp]
>+ retn
>+
> ALIGN 16
> interpolate8x8_halfpel_hv_add_mmx:
> PROLOG rounding2_mmx, 0 ; mm5 is busy. Don't load dst-rounder
>@@ -1246,6 +1294,9 @@ interpolate8x8_halfpel_hv_add_mmx:
> lea ecx,[ecx+edx]
> ADD_HH_MMX
>
>+ pop ebp
> ret
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/interpolate8x8_xmm.asm xvidcore-1.1.0-beta2/src/image/x86_asm/interpolate8x8_xmm.asm
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/interpolate8x8_xmm.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/interpolate8x8_xmm.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -42,20 +42,6 @@ BITS 32
> %endif
> %endmacro
>
>-;=============================================================================
>-; Read only data
>-;=============================================================================
>-
>-%ifdef FORMAT_COFF
>-SECTION .rodata
>-%else
>-SECTION .rodata align=16
>-%endif
>-
>-ALIGN 16
>-mmx_one:
>- times 8 db 1
>-
> SECTION .text
>
> cglobal interpolate8x8_halfpel_h_xmm
>@@ -128,7 +114,10 @@ interpolate8x8_halfpel_h_xmm:
>
> .rounding1
> ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
>- movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
>+ movq mm7, [esp]
>+ add esp, byte 8
> COPY_H_SSE_RND1
> lea ecx, [ecx+2*edx]
> COPY_H_SSE_RND1
>@@ -200,7 +189,10 @@ interpolate8x8_halfpel_v_xmm:
>
> .rounding1
> ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
>- movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
>+ movq mm7, [esp]
>+ add esp, byte 8
> movq mm2, [eax] ; loop invariant
> add eax, edx
>
>@@ -322,7 +314,10 @@ interpolate8x8_halfpel_hv_xmm:
> mov eax, [esp+ 8] ; Src
> mov edx, [esp+12] ; stride
>
>- movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
>+ movq mm7, [esp]
>+ add esp, byte 8
>
> ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
> movq mm2, [eax]
>@@ -455,8 +450,8 @@ interpolate8x8_halfpel_add_xmm: ; 23c
> pxor mm2, mm4
> pavgb mm1, mm3
> pxor mm3, mm5
>- pand mm2, [mmx_one]
>- pand mm3, [mmx_one]
>+ pand mm2, [esp]
>+ pand mm3, [esp]
> psubb mm0, mm2
> psubb mm1, mm3
> pavgb mm0, [ecx+%1]
>@@ -484,6 +479,8 @@ interpolate8x8_halfpel_h_add_xmm: ; 32
> .Loop1
> ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1
> ; movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
> ADD_FH_RND1 0, edx
> lea eax,[eax+2*edx]
> lea ecx,[ecx+2*edx]
> ADD_FH_RND1 0, edx
> lea eax,[eax+2*edx]
> lea ecx,[ecx+2*edx]
> ADD_FH_RND1 0, edx
>+ add esp, byte 8
> EPILOG
> .endfunc
>
>@@ -558,7 +556,10 @@ interpolate8x8_halfpel_v_add_xmm:
>
> .Loop1
> movq mm0, [eax] ; loop invariant
>- movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
>+ movq mm7, [esp]
>+ add esp, byte 8
>
> ADD_8_HF_RND1
> movq mm0, mm2
>@@ -681,7 +682,9 @@ ALIGN 16
> interpolate8x8_halfpel_hv_add_xmm:
> PROLOG1
>
>- movq mm7, [mmx_one]
>+ push dword 0x01010101
>+ push dword 0x01010101
>+ movq mm7, [esp]
>
> ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j
> movq mm2, [eax]
>@@ -710,6 +713,9 @@ interpolate8x8_halfpel_hv_add_xmm:
> add ecx, edx
> ADD_HH_RND1
>
>+ add esp, byte 8
> EPILOG
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/postprocessing_mmx.asm xvidcore-1.1.0-beta2/src/image/x86_asm/postprocessing_mmx.asm
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/postprocessing_mmx.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/postprocessing_mmx.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -70,6 +70,11 @@ mmx_offset:
>
> SECTION .text
>
>+extern _GLOBAL_OFFSET_TABLE_
>+get_pc.bp:
>+ mov ebp, [esp]
>+ retn
>+
> cglobal image_brightness_mmx
>
>
>@@ -83,16 +88,19 @@ image_brightness_mmx:
> push esi
> push edi
>
>- movq mm6, [mmx_0x80]
>-
> mov eax, [esp+8+20] ; offset
>- movq mm7, [mmx_offset + (eax + 128)*8] ; being lazy
>-
> mov edx, [esp+8+4] ; Dst
> mov ecx, [esp+8+8] ; stride
> mov esi, [esp+8+12] ; width
> mov edi, [esp+8+16] ; height
>
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+ movq mm6, [ebp + mmx_0x80 wrt ..gotoff]
>+ movq mm7, [ebp + mmx_offset + (eax + 128)*8 wrt ..gotoff] ; being lazy
>+ pop ebp
>+
> .yloop
> xor eax, eax
>
>@@ -124,3 +132,6 @@ image_brightness_mmx:
> ret
> .endfunc
> ;//////////////////////////////////////////////////////////////////////
>+
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/postprocessing_sse2.asm xvidcore-1.1.0-beta2/src/image/x86_asm/postprocessing_sse2.asm
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/postprocessing_sse2.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/postprocessing_sse2.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -42,19 +42,6 @@ BITS 32
> %endif
> %endmacro
>
>-;===========================================================================
>-; read only data
>-;===========================================================================
>-
>-%ifdef FORMAT_COFF
>-SECTION .rodata
>-%else
>-SECTION .rodata align=16
>-%endif
>-
>-xmm_0x80:
>- times 16 db 0x80
>-
> ;=============================================================================
> ; Code
> ;=============================================================================
>@@ -93,7 +80,12 @@ image_brightness_sse2:
> push edi ; 8 bytes offset for push
> sub esp, 32 ; 32 bytes for local data (16bytes will be used, 16bytes more to align correctly mod 16)
>
>- movdqa xmm6, [xmm_0x80]
>+ push dword 0x80808080
>+ push dword 0x80808080
>+ push dword 0x80808080
>+ push dword 0x80808080
>+ movdqa xmm6, [esp]
>+ add esp, byte 16
>
> ; Create a offset...offset vector
> mov eax, [esp+8+32+20] ; brightness offset value
>@@ -140,3 +132,6 @@ image_brightness_sse2:
> ret
> .endfunc
> ;//////////////////////////////////////////////////////////////////////
>+
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
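A bookkeeping rule runs through all of these prologues: each push executed before an [esp + N] argument read shifts N up by 4, which is why [esp + 8 + 16] systematically becomes [esp + 12 + 16] once ebp joins esi and edi on the stack. In sketch form (hypothetical two-argument cdecl function):

    my_func:                    ; on entry: [esp+4] = arg0, [esp+8] = arg1
        push esi                ; args now at [esp+4+4] and [esp+4+8]
        push edi                ; args now at [esp+8+4] and [esp+8+8]
        push ebp                ; args now at [esp+12+4] and [esp+12+8]
        mov eax, [esp + 12 + 4] ; arg0
        mov edx, [esp + 12 + 8] ; arg1
        pop ebp
        pop edi
        pop esi
        ret

Getting one of these offsets wrong corrupts an argument silently, which is why the qpel code below centralises the save/restore sequence in its PROLOG/EPILOG macros.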
>diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/qpel_mmx.asm xvidcore-1.1.0-beta2/src/image/x86_asm/qpel_mmx.asm
>--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/qpel_mmx.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/image/x86_asm/qpel_mmx.asm 2005-10-23 18:42:40.000000000 +0200
>@@ -201,6 +201,11 @@ FIR_C23: times 4 dw 23
>
> SECTION .text
>
>+extern _GLOBAL_OFFSET_TABLE_
>+get_pc.cx:
>+ mov ecx, [esp]
>+ retn
>+
> ;//////////////////////////////////////////////////////////////////////
> ;// Here we go with the Q-Pel mess.
> ;// For horizontal passes, we process 4 *output* pixel in parallel
> ;// For vertical ones, we process 4 *input* pixel in parallel.
> ;//////////////////////////////////////////////////////////////////////
>
> %macro PROLOG_NO_AVRG 0
>+ push ebx
> push esi
> push edi
> push ebp
>- mov edi, [esp+16 + 0*4] ; Dst
>- mov esi, [esp+16 + 1*4] ; Src
>- mov ecx, [esp+16 + 2*4] ; Size
>- mov ebp, [esp+16 + 3*4] ; BpS
>- mov eax, [esp+16 + 4*4] ; Rnd
>+ mov edi, [esp+20 + 0*4] ; Dst
>+ mov esi, [esp+20 + 1*4] ; Src
>+ mov ebp, [esp+20 + 3*4] ; BpS
>+ mov eax, [esp+20 + 4*4] ; Rnd
> and eax, 1
>- movq mm7, [Rounder_QP_MMX+eax*8] ; rounder
>+ call get_pc.cx
>+ add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+ movq mm7, [ecx + Rounder_QP_MMX+eax*8 wrt ..gotoff] ; rounder
> %endmacro
>
> %macro EPILOG_NO_AVRG 0
>+ pop ebx
> pop ebp
> pop edi
> pop esi
>@@ -234,12 +242,13 @@ SECTION .text
> push ebp
> mov edi, [esp+20 + 0*4] ; Dst
> mov esi, [esp+20 + 1*4] ; Src
>- mov ecx, [esp+20 + 2*4] ; Size
> mov ebp, [esp+20 + 3*4] ; BpS
> mov eax, [esp+20 + 4*4] ; Rnd
> and eax, 1
>- movq mm7, [Rounder_QP_MMX+eax*8] ; rounder
>- lea ebx, [Rounder1_MMX+eax*8] ; *Rounder2
>+ call get_pc.cx
>+ add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+ movq mm7, [ecx + Rounder_QP_MMX+eax*8 wrt ..gotoff] ; rounder
>+ lea ebx, [ecx + Rounder1_MMX+eax*8 wrt ..gotoff] ; *Rounder2
> %endmacro
>
> %macro EPILOG_AVRG 0
>@@ -261,23 +270,23 @@ SECTION .text
> %macro TLOAD 2 ; %1,%2: src pixels
> movzx eax, byte [esi+%1]
> movzx edx, byte [esi+%2]
>- movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ]
>- movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ]
>+ movq mm0, [ecx + xvid_FIR_14_3_2_1 + eax*8 wrt ..gotoff]
>+ movq mm3, [ecx + xvid_FIR_1_2_3_14 + edx*8 wrt ..gotoff]
> paddw mm0, mm7
> paddw mm3, mm7
> %endmacro
>
> %macro TACCUM2 5 ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs
> movzx eax, byte [esi+%1]
>- paddw %4, [%2 + eax*8]
>- paddw %5, [%3 + eax*8]
>+ paddw %4, [eax*8 + %2]
>+ paddw %5, [eax*8 + %3]
> %endmacro
>
> %macro TACCUM3 7 ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs
> movzx eax, byte [esi+%1]
>- paddw %5, [%2 + eax*8]
>- paddw %6, [%3 + eax*8]
>- paddw %7, [%4 + eax*8]
>+ paddw %5, [eax*8 + %2]
>+ paddw %6, [eax*8 + %3]
>+ paddw %7, [eax*8 + %4]
> %endmacro
>
> ;//////////////////////////////////////////////////////////////////////
>@@ -287,32 +296,32 @@ SECTION .text
> %macro LOAD 2 ; %1,%2: src pixels
> movzx eax, byte [esi+%1]
> movzx edx, byte [esi+%2]
>- movq mm0, [xvid_Expand_mmx + eax*8]
>- movq mm3, [xvid_Expand_mmx + edx*8]
>- pmullw mm0, [FIR_R0 ]
>- pmullw mm3, [FIR_R16]
>+ movq mm0, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
>+ movq mm3, [ecx + xvid_Expand_mmx + edx*8 wrt ..gotoff]
>+ pmullw mm0, [ecx + FIR_R0 wrt ..gotoff]
>+ pmullw mm3, [ecx + FIR_R16 wrt ..gotoff]
> paddw mm0, mm7
> paddw mm3, mm7
> %endmacro
>
> %macro ACCUM2 4 ;src pixel/Taps/dst regs #1-#2
> movzx eax, byte [esi+%1]
>- movq mm4, [xvid_Expand_mmx + eax*8]
>+ movq mm4, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
> movq mm5, mm4
> pmullw mm4, [%2]
>- pmullw mm5, [%2+8]
>+ pmullw mm5, [8+%2]
> paddw %3, mm4
> paddw %4, mm5
> %endmacro
>
> %macro ACCUM3 5 ;src pixel/Taps/dst regs #1-#2-#3
> movzx eax, byte [esi+%1]
>- movq mm4, [xvid_Expand_mmx + eax*8]
>+ movq mm4, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff]
> movq mm5, mm4
> movq mm6, mm5
>- pmullw mm4, [%2 ]
>- pmullw mm5, [%2+ 8]
>- pmullw mm6, [%2+16]
>+ pmullw mm4, [ %2]
>+ pmullw mm5, [ 8+%2]
>+ pmullw mm6, [16+%2]
> paddw %3, mm4
> paddw %4, mm5
> paddw %5, mm6
>@@ -359,23 +368,23 @@ SECTION .text
> movq mm1, mm7
> movq mm2, mm7
>
>- ACCUM2 1, FIR_R1, mm0, mm1
>- ACCUM2 2, FIR_R2, mm0, mm1
>- ACCUM2 3, FIR_R3, mm0, mm1
>- ACCUM2 4, FIR_R4, mm0, mm1
>-
>- ACCUM3 5, FIR_R5, mm0, mm1, mm2
>- ACCUM3 6, FIR_R6, mm0, mm1, mm2
>- ACCUM3 7, FIR_R7, mm0, mm1, mm2
>- ACCUM2 8, FIR_R8, mm1, mm2
>- ACCUM3 9, FIR_R9, mm1, mm2, mm3
>- ACCUM3 10, FIR_R10,mm1, mm2, mm3
>- ACCUM3 11, FIR_R11,mm1, mm2, mm3
>-
>- ACCUM2 12, FIR_R12, mm2, mm3
>- ACCUM2 13, FIR_R13, mm2, mm3
>- ACCUM2 14, FIR_R14, mm2, mm3
>- ACCUM2 15, FIR_R15, mm2, mm3
>+ ACCUM2 1, ecx + FIR_R1 wrt ..gotoff, mm0, mm1
>+ ACCUM2 2, ecx + FIR_R2 wrt ..gotoff, mm0, mm1
>+ ACCUM2 3, ecx + FIR_R3 wrt ..gotoff, mm0, mm1
>+ ACCUM2 4, ecx + FIR_R4 wrt ..gotoff, mm0, mm1
>+
>+ ACCUM3 5, ecx + FIR_R5 wrt ..gotoff, mm0, mm1, mm2
>+ ACCUM3 6, ecx + FIR_R6 wrt ..gotoff, mm0, mm1, mm2
>+ ACCUM3 7, ecx + FIR_R7 wrt ..gotoff, mm0, mm1, mm2
>+ ACCUM2 8, ecx + FIR_R8 wrt ..gotoff, mm1, mm2
>+ ACCUM3 9, ecx + FIR_R9 wrt ..gotoff, mm1, mm2, mm3
>+ ACCUM3 10, ecx + FIR_R10 wrt ..gotoff,mm1, mm2, mm3
>+ ACCUM3 11, ecx + FIR_R11 wrt ..gotoff,mm1, mm2, mm3
>+
>+ ACCUM2 12, ecx + FIR_R12 wrt ..gotoff, mm2, mm3
>+ ACCUM2 13, ecx + FIR_R13 wrt ..gotoff, mm2, mm3
>+ ACCUM2 14, ecx + FIR_R14 wrt ..gotoff, mm2, mm3
>+ ACCUM2 15, ecx + FIR_R15 wrt ..gotoff, mm2, mm3
>
> %else
>
>@@ -383,25 +392,25 @@ SECTION .text
> movq mm1, mm7
> movq mm2, mm7
>
>- TACCUM2 1, xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm1
>- TACCUM2 2, xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1
>- TACCUM2 3, xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1
>- TACCUM2 4, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1, mm0, mm1
>-
>- TACCUM3 5, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0 , mm0, mm1, mm2
>- TACCUM3 6, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1, mm2
>- TACCUM3 7, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1, mm2
>-
>- TACCUM2 8, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm1, mm2
>-
>- TACCUM3 9, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0, mm1, mm2, mm3
>- TACCUM3 10, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0, mm1, mm2, mm3
>- TACCUM3 11, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0, mm1, mm2, mm3
>-
>- TACCUM2 12, xvid_FIR_1_3_6_20, xvid_FIR_20_6_3_1 , mm2, mm3
>- TACCUM2 13, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm2, mm3
>- TACCUM2 14, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm2, mm3
>- TACCUM2 15, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm2, mm3
>+ TACCUM2 1, ecx + xvid_FIR_23_19_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm1
>+ TACCUM2 2, ecx + xvid_FIR_7_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm1
>+ TACCUM2 3, ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm1
>+ TACCUM2 4, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff, mm0, mm1
>+
>+ TACCUM3 5, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm1, mm2
>+ TACCUM3 6, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm1, mm2
>+ TACCUM3 7, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm1, mm2
>+
>+ TACCUM2 8, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm1, mm2
>+
>+ TACCUM3 9, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff, mm1, mm2, mm3
>+ TACCUM3 10, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff, mm1, mm2, mm3
>+ TACCUM3 11, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff, mm1, mm2, mm3
>+
>+ TACCUM2 12, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff, ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm2, mm3
>+ TACCUM2 13, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, mm2, mm3
>+ TACCUM2 14, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_7 wrt ..gotoff, mm2, mm3
>+ TACCUM2 15, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_19_23 wrt ..gotoff, mm2, mm3
>
> %endif
>
>@@ -418,7 +427,7 @@ SECTION .text
> MIX mm0, esi+1, ebx
> %endif
> %if (%2==1)
>- MIX mm0, edi, Rounder1_MMX
>+ MIX mm0, edi, ecx + Rounder1_MMX wrt ..gotoff
> %endif
>
> %if (%1==1)
>@@ -427,7 +436,7 @@ SECTION .text
> MIX mm2, esi+9, ebx
> %endif
> %if (%2==1)
>- MIX mm2, edi+8, Rounder1_MMX
>+ MIX mm2, edi+8, ecx + Rounder1_MMX wrt ..gotoff
> %endif
>
> lea esi, [esi+ebp]
>@@ -436,7 +445,7 @@ SECTION .text
> movq [edi+8], mm2
>
> add edi, ebp
>- dec ecx
>+ dec dword [esp+20 + 2*4]
> jg .Loop
>
> %if (%2==0) && (%1==0)
>@@ -464,64 +473,64 @@ SECTION .text
> %ifndef USE_TABLES
>
> LOAD 0, 8 ; special case for 1rst/last pixel
>- ACCUM2 1, FIR_R1, mm0, mm3
>- ACCUM2 2, FIR_R2, mm0, mm3
>- ACCUM2 3, FIR_R3, mm0, mm3
>- ACCUM2 4, FIR_R4, mm0, mm3
>-
>- ACCUM2 5, FIR_R13, mm0, mm3
>- ACCUM2 6, FIR_R14, mm0, mm3
>- ACCUM2 7, FIR_R15, mm0, mm3
>+ ACCUM2 1, ecx + FIR_R1 wrt ..gotoff, mm0, mm3
>+ ACCUM2 2, ecx + FIR_R2 wrt ..gotoff, mm0, mm3
>+ ACCUM2 3, ecx + FIR_R3 wrt ..gotoff, mm0, mm3
>+ ACCUM2 4, ecx + FIR_R4 wrt ..gotoff, mm0, mm3
>+
>+ ACCUM2 5, ecx + FIR_R13 wrt ..gotoff, mm0, mm3
>+ ACCUM2 6, ecx + FIR_R14 wrt ..gotoff, mm0, mm3
>+ ACCUM2 7, ecx + FIR_R15 wrt ..gotoff, mm0, mm3
>
> %else
>
> %if 0 ; test with no unrolling
>
> TLOAD 0, 8 ; special case for 1rst/last pixel
>- TACCUM2 1, xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm3
>- TACCUM2 2, xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm3
>- TACCUM2 3, xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm3
>- TACCUM2 4, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm0, mm3
>- TACCUM2 5, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm0, mm3
>- TACCUM2 6, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm0, mm3
>- TACCUM2 7, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm0, mm3
>+ TACCUM2 1, ecx + xvid_FIR_23_19_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm3
>+ TACCUM2 2, ecx + xvid_FIR_7_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm3
>+ TACCUM2 3, ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm3
>+ TACCUM2 4, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm0, mm3
>+ TACCUM2 5, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, mm0, mm3
>+ TACCUM2 6, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_7 wrt ..gotoff, mm0, mm3
>+ TACCUM2 7, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_19_23 wrt ..gotoff, mm0, mm3
>
> %else ; test with unrolling (little faster, but not much)
>
> movzx eax, byte [esi]
> movzx edx, byte [esi+8]
>- movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ]
>+ movq mm0, [ecx + xvid_FIR_14_3_2_1 + eax*8 wrt ..gotoff]
> movzx eax, byte [esi+1]
>- movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ]
>+ movq mm3, [ecx + xvid_FIR_1_2_3_14 + edx*8 wrt ..gotoff]
> paddw mm0, mm7
> paddw mm3, mm7
>
> movzx edx, byte [esi+2]
>- paddw mm0, [xvid_FIR_23_19_6_3 + eax*8]
>- paddw mm3, [xvid_FIR_1_0_0_0 + eax*8]
>+ paddw mm0, [ecx + xvid_FIR_23_19_6_3 + eax*8 wrt ..gotoff]
>+ paddw mm3, [ecx + xvid_FIR_1_0_0_0 + eax*8 wrt ..gotoff]
>
> movzx eax, byte [esi+3]
>- paddw mm0, [xvid_FIR_7_20_20_6 + edx*8]
>- paddw mm3, [xvid_FIR_3_1_0_0 + edx*8]
>+ paddw mm0, [ecx + xvid_FIR_7_20_20_6 + edx*8 wrt ..gotoff]
>+ paddw mm3, [ecx + xvid_FIR_3_1_0_0 + edx*8 wrt ..gotoff]
>
> movzx edx, byte [esi+4]
>- paddw mm0, [xvid_FIR_3_6_20_20 + eax*8]
>- paddw mm3, [xvid_FIR_6_3_1_0 + eax*8]
>+ paddw mm0, [ecx + xvid_FIR_3_6_20_20 + eax*8 wrt ..gotoff]
>+ paddw mm3, [ecx + xvid_FIR_6_3_1_0 + eax*8 wrt ..gotoff]
>
> movzx eax, byte [esi+5]
>- paddw mm0, [xvid_FIR_1_3_6_20 + edx*8]
>- paddw mm3, [xvid_FIR_20_6_3_1 + edx*8]
>+ paddw mm0, [ecx + xvid_FIR_1_3_6_20 + edx*8 wrt ..gotoff]
>+ paddw mm3, [ecx + xvid_FIR_20_6_3_1 + edx*8 wrt ..gotoff]
>
> movzx edx, byte [esi+6]
>- paddw mm0, [xvid_FIR_0_1_3_6 + eax*8]
>- paddw mm3, [xvid_FIR_20_20_6_3 + eax*8]
>+ paddw mm0, [ecx + xvid_FIR_0_1_3_6 + eax*8 wrt ..gotoff]
>+ paddw mm3, [ecx + xvid_FIR_20_20_6_3 + eax*8 wrt ..gotoff]
>
> movzx eax, byte [esi+7]
>- paddw mm0, [xvid_FIR_0_0_1_3 + edx*8]
>- paddw mm3, [xvid_FIR_6_20_20_7 + edx*8]
>+ paddw mm0, [ecx + xvid_FIR_0_0_1_3 + edx*8 wrt ..gotoff]
>+ paddw mm3, [ecx + xvid_FIR_6_20_20_7 + edx*8 wrt ..gotoff]
>
>- paddw mm0, [xvid_FIR_0_0_0_1 + eax*8]
>- paddw mm3, [xvid_FIR_3_6_19_23 + eax*8]
>+ paddw mm0, [ecx + xvid_FIR_0_0_0_1 + eax*8 wrt ..gotoff]
>+ paddw mm3, [ecx + xvid_FIR_3_6_19_23 + eax*8 wrt ..gotoff]
>
> %endif
>
>@@ -537,14 +546,14 @@ SECTION .text
> MIX mm0, esi+1, ebx
> %endif
> %if (%2==1)
>- MIX mm0, edi, Rounder1_MMX
>+ MIX mm0, edi, ecx + Rounder1_MMX wrt ..gotoff
> %endif
>
> movq [edi], mm0
>
> add edi, ebp
> add esi, ebp
>- dec ecx
>+ dec dword [esp+20 + 2*4]
> jg .Loop
>
> %if (%2==0) && (%1==0)
>@@ -678,7 +687,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
> V_MIX %3, esi, ebx
> %endif
> %if (%2==1)
>- V_MIX %3, edi, Rounder1_MMX
>+ V_MIX %3, edi, ecx + Rounder1_MMX wrt ..gotoff
> %endif
>
> movd eax, %3
>@@ -718,28 +727,28 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
> movq mm3, mm7
>
> V_LOAD 0
>- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
>+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
> V_LOAD 0
>- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
>+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
> V_LOAD 0
>- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
>+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
> V_LOAD 0
>- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
>+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
> V_LOAD 0
>- V_ACC4 mm0, mm1, mm2,
mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > V_STORE %1, %2, mm0, 0 > > V_LOAD 0 >- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3 >- V_ACC1 mm3, FIR_Cm6 >+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff >+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff > V_STORE %1, %2, mm1, 0 > > V_LOAD 0 >- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3 >+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff > V_STORE %1, %2, mm2, 0 > > V_LOAD 1 >- V_ACC1 mm3, FIR_Cm1 >+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff > V_STORE %1, %2, mm3, 0 > > ; ouput rows [4..7], from input rows [1..11] (!!) >@@ -756,38 +765,38 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx: > movq mm3, mm7 > > V_LOAD 0 >- V_ACC1 mm0, FIR_Cm1 >+ V_ACC1 mm0, ecx + FIR_Cm1 wrt ..gotoff > > V_LOAD 0 >- V_ACC2l mm0, mm1, FIR_C3, FIR_Cm1 >+ V_ACC2l mm0, mm1, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff > > V_LOAD 0 >- V_ACC2 mm0, mm1, FIR_Cm6, FIR_C3 >- V_ACC1 mm2, FIR_Cm1 >+ V_ACC2 mm0, mm1, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff >+ V_ACC1 mm2, ecx + FIR_Cm1 wrt ..gotoff > > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > V_STORE %1, %2, mm0, 0 > > V_LOAD 0 >- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3 >- V_ACC1 mm3, FIR_Cm6 >+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff >+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff > V_STORE %1, %2, mm1, 0 > > V_LOAD 0 >- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3 >+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff > V_STORE %1, %2, mm2, 0 > > V_LOAD 1 >- V_ACC1 mm3, FIR_Cm1 >+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff > V_STORE %1, %2, mm3, 0 > > ; ouput rows [8..11], from input rows [5..15] >@@ -804,39 +813,39 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx: > movq mm3, mm7 > > V_LOAD 0 >- V_ACC1 mm0, FIR_Cm1 >+ V_ACC1 mm0, ecx + FIR_Cm1 wrt ..gotoff > > V_LOAD 0 >- V_ACC2l mm0, mm1, FIR_C3, FIR_Cm1 >+ V_ACC2l mm0, mm1, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff > > V_LOAD 0 >- V_ACC2 mm0, mm1, FIR_Cm6, FIR_C3 >- V_ACC1 mm2, FIR_Cm1 >+ V_ACC2 mm0, mm1, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff >+ V_ACC1 mm2, ecx + FIR_Cm1 wrt ..gotoff > > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, 
FIR_C3 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > > V_STORE %1, %2, mm0, 0 > > V_LOAD 0 >- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3 >- V_ACC1 mm3, FIR_Cm6 >+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff >+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff > V_STORE %1, %2, mm1, 0 > > V_LOAD 0 >- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3 >+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff > V_STORE %1, %2, mm2, 0 > > V_LOAD 1 >- V_ACC1 mm3, FIR_Cm1 >+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff > V_STORE %1, %2, mm3, 0 > > >@@ -855,25 +864,25 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx: > movq mm3, mm7 > > V_LOAD 0 >- V_ACC1 mm3, FIR_Cm1 >+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff > > V_LOAD 0 >- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3 >+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff > > V_LOAD 0 >- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3 >- V_ACC1 mm3, FIR_Cm6 >+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff >+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff > > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff > V_LOAD 1 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff > > V_STORE %1, %2, mm3, 0 > V_STORE %1, %2, mm2, 0 >@@ -886,7 +895,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx: > pop edi > add esi, 4 > add edi, 4 >- sub ecx, 4 >+ sub dword [esp+20 + 2*4], 4 > jg .Loop > > %if (%2==0) && (%1==0) >@@ -924,29 +933,29 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx: > movq mm3, mm7 > > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt 
..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > V_STORE %1, %2, mm0, 0 > > V_LOAD 0 >- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3 >- V_ACC1 mm3, FIR_Cm6 >+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff >+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff > > V_STORE %1, %2, mm1, 0 > > V_LOAD 0 >- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3 >+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff > V_STORE %1, %2, mm2, 0 > > V_LOAD 1 >- V_ACC1 mm3, FIR_Cm1 >+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff > V_STORE %1, %2, mm3, 0 > > ; ouput rows [4..7], from input rows [1..9] >@@ -964,25 +973,25 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx: > movq mm3, mm7 > > V_LOAD 0 >- V_ACC1 mm3, FIR_Cm1 >+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff > > V_LOAD 0 >- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3 >+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff > > V_LOAD 0 >- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3 >- V_ACC1 mm3, FIR_Cm6 >+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff >+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff > > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff > V_LOAD 0 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff > V_LOAD 1 >- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1 >+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff > > V_STORE %1, %2, mm3, 0 > V_STORE %1, %2, mm2, 0 >@@ -995,7 +1004,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx: > pop edi > add esi, 4 > add edi, 4 >- sub ecx, 4 >+ sub dword [esp+20 + 2*4], 4 > jg .Loop > > %if (%2==0) && (%1==0) >@@ -1060,3 +1069,6 @@ xvid_V_Pass_Avrg_Up_8_Add_mmx: > .endfunc > > ;////////////////////////////////////////////////////////////////////// >+ >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/image/x86_asm/reduced_mmx.asm xvidcore-1.1.0-beta2/src/image/x86_asm/reduced_mmx.asm >--- xvidcore-1.1.0-beta2-old/src/image/x86_asm/reduced_mmx.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/image/x86_asm/reduced_mmx.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -91,8 +91,8 @@ cglobal xvid_Filter_Diff_18x18_To_8x8_mm > 
pmullw mm4, %4 ; [Up31] > pmullw %2, %3 ; [Up13] > pmullw mm5, %4 ; [Up31] >- paddsw %1, [Cst2] >- paddsw %2, [Cst2] >+ paddsw %1, [ebp + Cst2 wrt ..gotoff] >+ paddsw %2, [ebp + Cst2 wrt ..gotoff] > paddsw %1, mm4 > paddsw %2, mm5 > %endmacro >@@ -126,14 +126,14 @@ cglobal xvid_Filter_Diff_18x18_To_8x8_mm > > %macro MIX_ROWS 4 ; %1/%2:prev %3/4:cur (preserved) mm4/mm5: output > ; we need to perform: (%1,%3) -> (%1 = 3*%1+%3, mm4 = 3*%3+%1), %3 preserved. >- movq mm4, [Cst3] >- movq mm5, [Cst3] >+ movq mm4, [ebp + Cst3 wrt ..gotoff] >+ movq mm5, [ebp + Cst3 wrt ..gotoff] > pmullw mm4, %3 > pmullw mm5, %4 > paddsw mm4, %1 > paddsw mm5, %2 >- pmullw %1, [Cst3] >- pmullw %2, [Cst3] >+ pmullw %1, [ebp + Cst3 wrt ..gotoff] >+ pmullw %2, [ebp + Cst3 wrt ..gotoff] > paddsw %1, %3 > paddsw %2, %4 > %endmacro >@@ -176,8 +176,12 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 > mov edx, [esp+8] ; Src > mov eax, [esp+12] ; BpS > >- movq mm6, [Up13] >- movq mm7, [Up31] >+ push ebp >+ call get_pc.bp >+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ movq mm6, [ebp + Up13 wrt ..gotoff] >+ movq mm7, [ebp + Up31 wrt ..gotoff] > > COL03 mm0, mm1, 0 > MUL_PACK mm0,mm1, mm6, mm7 >@@ -223,7 +227,7 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 > > STORE_1 mm2, mm3 > >- mov ecx, [esp+4] >+ mov ecx, [esp+8] > add ecx, 8 > > COL47 mm0, mm1, 0 >@@ -270,6 +274,7 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 > > STORE_1 mm2, mm3 > >+ pop ebp > ret > .endfunc > >@@ -292,8 +297,8 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 > ; (x*4 + 2)/4 = x - (x<0) > ; So, better revert to (x*4)/4 = x. > >- psubsw %1, [Cst2000] >- psubsw %2, [Cst0002] >+ psubsw %1, [ebp + Cst2000 wrt ..gotoff] >+ psubsw %2, [ebp + Cst0002 wrt ..gotoff] > pxor mm6, mm6 > pxor mm7, mm7 > pcmpgtw mm6, %1 >@@ -308,8 +313,8 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 > ; mix with destination [ecx] > movq mm6, [ecx] > movq mm7, [ecx] >- punpcklbw mm6, [Cst0] >- punpckhbw mm7, [Cst0] >+ punpcklbw mm6, [ebp + Cst0 wrt ..gotoff] >+ punpckhbw mm7, [ebp + Cst0 wrt ..gotoff] > paddsw %1, mm6 > paddsw %2, mm7 > packuswb %1,%2 >@@ -342,16 +347,16 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34 > ; mix with destination > movq mm6, [ecx] > movq mm7, [ecx] >- punpcklbw mm6, [Cst0] >- punpckhbw mm7, [Cst0] >+ punpcklbw mm6, [ebp + Cst0 wrt ..gotoff] >+ punpckhbw mm7, [ebp + Cst0 wrt ..gotoff] > paddsw %1, mm6 > paddsw %2, mm7 > > movq mm6, [ecx+eax] > movq mm7, [ecx+eax] > >- punpcklbw mm6, [Cst0] >- punpckhbw mm7, [Cst0] >+ punpcklbw mm6, [ebp + Cst0 wrt ..gotoff] >+ punpckhbw mm7, [ebp + Cst0 wrt ..gotoff] > paddsw mm4, mm6 > paddsw mm5, mm7 > >@@ -373,98 +378,103 @@ xvid_Add_Upsampled_8x8_16To8_mmx: ; 579 > mov edx, [esp+8] ; Src > mov eax, [esp+12] ; BpS > >+ push ebp >+ call get_pc.bp >+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ > COL03 mm0, mm1, 0 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > movq mm4, mm0 > movq mm5, mm1 > STORE_ADD_1 mm4, mm5 > add ecx, eax > > COL03 mm2, mm3, 1 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL03 mm0, mm1, 2 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL03 mm2, mm3, 3 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > 
STORE_ADD_2 mm0, mm1 > > COL03 mm0, mm1, 4 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL03 mm2, mm3, 5 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL03 mm0, mm1, 6 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL03 mm2, mm3, 7 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > STORE_ADD_1 mm2, mm3 > > >- mov ecx, [esp+4] >+ mov ecx, [esp+8] > add ecx, 8 > > COL47 mm0, mm1, 0 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > movq mm4, mm0 > movq mm5, mm1 > STORE_ADD_1 mm4, mm5 > add ecx, eax > > COL47 mm2, mm3, 1 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL47 mm0, mm1, 2 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL47 mm2, mm3, 3 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL47 mm0, mm1, 4 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL47 mm2, mm3, 5 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL47 mm0, mm1, 6 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL47 mm2, mm3, 7 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > STORE_ADD_1 mm2, mm3 > >+ pop ebp > ret > .endfunc > >@@ -503,8 +513,12 @@ xvid_Copy_Upsampled_8x8_16To8_xmm: ; 31 > mov edx, [esp+8] ; Src > mov eax, [esp+12] ; BpS > >- movq mm6, [Up13] >- movq mm7, [Up31] >+ push ebp >+ call get_pc.bp >+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ movq mm6, [ebp + Up13 wrt ..gotoff] >+ movq mm7, [ebp + Up31 wrt ..gotoff] > > COL03_SSE mm0, mm1, 0 > MUL_PACK mm0,mm1, mm6, mm7 >@@ -550,7 +564,7 @@ xvid_Copy_Upsampled_8x8_16To8_xmm: ; 31 > > STORE_1 mm2, mm3 > >- mov ecx, [esp+4] >+ mov ecx, [esp+8] > add ecx, 8 > > COL47_SSE mm0, mm1, 0 >@@ -597,6 +611,7 @@ xvid_Copy_Upsampled_8x8_16To8_xmm: ; 31 > > STORE_1 mm2, mm3 > >+ pop ebp > ret > .endfunc > >@@ -614,98 +629,103 @@ xvid_Add_Upsampled_8x8_16To8_xmm: ; 549 > mov edx, [esp+8] ; Src > mov eax, [esp+12] ; BpS > >+ push ebp >+ call get_pc.bp >+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ > COL03_SSE mm0, mm1, 0 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > movq mm4, mm0 > movq mm5, mm1 > STORE_ADD_1 mm4, mm5 > add ecx, eax > > COL03_SSE mm2, mm3, 1 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 
wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL03_SSE mm0, mm1, 2 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL03_SSE mm2, mm3, 3 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL03_SSE mm0, mm1, 4 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL03_SSE mm2, mm3, 5 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL03_SSE mm0, mm1, 6 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL03_SSE mm2, mm3, 7 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > STORE_ADD_1 mm2, mm3 > > >- mov ecx, [esp+4] >+ mov ecx, [esp+8] > add ecx, 8 > > COL47_SSE mm0, mm1, 0 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > movq mm4, mm0 > movq mm5, mm1 > STORE_ADD_1 mm4, mm5 > add ecx, eax > > COL47_SSE mm2, mm3, 1 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL47_SSE mm0, mm1, 2 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL47_SSE mm2, mm3, 3 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL47_SSE mm0, mm1, 4 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL47_SSE mm2, mm3, 5 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > COL47_SSE mm0, mm1, 6 >- MUL_PACK mm0,mm1, [Up13], [Up31] >+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm2, mm3, mm0, mm1 > STORE_ADD_2 mm2, mm3 > > COL47_SSE mm2, mm3, 7 >- MUL_PACK mm2,mm3, [Up13], [Up31] >+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff] > MIX_ROWS mm0, mm1, mm2, mm3 > STORE_ADD_2 mm0, mm1 > > STORE_ADD_1 mm2, mm3 > >+ pop ebp > ret > .endfunc > >@@ -732,7 +752,10 @@ xvid_HFilter_31_mmx: > mov edi, [esp+8 +8] ; Src2 > mov eax, [esp+12 +8] ; Nb_Blks > lea eax,[eax*2] >- movq mm5, [Cst2] >+ push dword 0x00020002 >+ push dword 0x00020002 >+ movq mm5, [esp] ; Cst2 >+ add esp, byte 8 > pxor mm7, mm7 > > lea esi, [esi+eax*4] >@@ -848,7 +871,7 @@ xvid_HFilter_31_x86: > ;////////////////////////////////////////////////////////////////////// > > %macro HFILTER_1331 2 ;%1:src %2:dst reg. 
-trashes mm0/mm1/mm2 >- movq mm2, [Mask_ff] >+ movq mm2, [ebp + Mask_ff wrt ..gotoff] > movq %2, [%1-1] ;-10123456 > movq mm0, [%1] ; 01234567 > movq mm1, [%1+1] ; 12345678 >@@ -863,7 +886,7 @@ xvid_HFilter_31_x86: > %endmacro > > %macro VFILTER_1331 4 ; %1-4: regs %1-%2: trashed >- paddsw %1, [Cst32] >+ paddsw %1, [ebp + Cst32 wrt ..gotoff] > paddsw %2, %3 > pmullw %2, mm7 > paddsw %1,%4 >@@ -899,7 +922,11 @@ xvid_Filter_18x18_To_8x8_mmx: ; 283c > mov edx, [esp+8] ; Src > mov eax, [esp+12] ; BpS > >- movq mm7, [Cst3] >+ push ebp >+ call get_pc.bp >+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ movq mm7, [ebp + Cst3 wrt ..gotoff] > sub edx, eax > > ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line. >@@ -917,7 +944,7 @@ xvid_Filter_18x18_To_8x8_mmx: ; 283c > > ; process columns 4-7 > >- mov edx, [esp+8] >+ mov edx, [esp+12] > sub edx, eax > add edx, 8 > >@@ -930,6 +957,7 @@ xvid_Filter_18x18_To_8x8_mmx: ; 283c > COPY_TWO_LINES_1331 ecx + 4*16 +8 > COPY_TWO_LINES_1331 ecx + 6*16 +8 > >+ pop ebp > ret > .endfunc > >@@ -958,6 +986,11 @@ xvid_Filter_18x18_To_8x8_mmx: ; 283c > movq [%1+16], mm2 > %endmacro > >+extern _GLOBAL_OFFSET_TABLE_ >+get_pc.bp: >+ mov ebp, [esp] >+ retn >+ > align 16 > xvid_Filter_Diff_18x18_To_8x8_mmx: ; 302c > >@@ -965,7 +998,11 @@ xvid_Filter_Diff_18x18_To_8x8_mmx: ; 30 > mov edx, [esp+8] ; Src > mov eax, [esp+12] ; BpS > >- movq mm7, [Cst3] >+ push ebp >+ call get_pc.bp >+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ movq mm7, [ebp + Cst3 wrt ..gotoff] > sub edx, eax > > ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line. >@@ -982,7 +1019,7 @@ xvid_Filter_Diff_18x18_To_8x8_mmx: ; 30 > DIFF_TWO_LINES_1331 ecx + 6*16 > > ; process columns 4-7 >- mov edx, [esp+8] >+ mov edx, [esp+12] > sub edx, eax > add edx, 8 > >@@ -995,6 +1032,7 @@ xvid_Filter_Diff_18x18_To_8x8_mmx: ; 30 > DIFF_TWO_LINES_1331 ecx + 4*16 +8 > DIFF_TWO_LINES_1331 ecx + 6*16 +8 > >+ pop ebp > ret > .endfunc > >@@ -1002,3 +1040,5 @@ xvid_Filter_Diff_18x18_To_8x8_mmx: ; 30 > > ; pfeewwww... Never Do That On Stage Again. 
:) > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/motion/x86_asm/sad_3dn.asm xvidcore-1.1.0-beta2/src/motion/x86_asm/sad_3dn.asm >--- xvidcore-1.1.0-beta2-old/src/motion/x86_asm/sad_3dn.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/motion/x86_asm/sad_3dn.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -44,20 +44,6 @@ BITS 32 > %endmacro > > ;============================================================================= >-; Read only data >-;============================================================================= >- >-%ifdef FORMAT_COFF >-SECTION .rodata >-%else >-SECTION .rodata align=16 >-%endif >- >-ALIGN 16 >-mmx_one: >- times 4 dw 1 >- >-;============================================================================= > ; Helper macros > ;============================================================================= > %macro SADBI_16x16_3DN 0 >@@ -179,7 +165,10 @@ sad16bi_3dn: > SADBI_16x16_3DN > SADBI_16x16_3DN > >- pmaddwd mm6, [mmx_one] ; collapse >+ push dword 0x00010001 >+ push dword 0x00010001 >+ pmaddwd mm6, [esp] ; collapse >+ add esp, byte 8 > movq mm7, mm6 > psrlq mm7, 32 > paddd mm6, mm7 >@@ -216,7 +205,10 @@ sad8bi_3dn: > SADBI_8x8_3DN > SADBI_8x8_3DN > >- pmaddwd mm6, [mmx_one] ; collapse >+ push dword 0x00010001 >+ push dword 0x00010001 >+ pmaddwd mm6, [esp] ; collapse >+ add esp, byte 8 > movq mm7, mm6 > psrlq mm7, 32 > paddd mm6, mm7 >@@ -228,3 +220,5 @@ sad8bi_3dn: > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/motion/x86_asm/sad_3dne.asm xvidcore-1.1.0-beta2/src/motion/x86_asm/sad_3dne.asm >--- xvidcore-1.1.0-beta2-old/src/motion/x86_asm/sad_3dne.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/motion/x86_asm/sad_3dne.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -502,3 +502,5 @@ ALIGN 8 > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/motion/x86_asm/sad_mmx.asm xvidcore-1.1.0-beta2/src/motion/x86_asm/sad_mmx.asm >--- xvidcore-1.1.0-beta2-old/src/motion/x86_asm/sad_mmx.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/motion/x86_asm/sad_mmx.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -45,20 +45,6 @@ BITS 32 > %endmacro > > ;============================================================================= >-; Read only data >-;============================================================================= >- >-%ifdef FORMAT_COFF >-SECTION .rodata >-%else >-SECTION .rodata align=16 >-%endif >- >-ALIGN 16 >-mmx_one: >- times 4 dw 1 >- >-;============================================================================= > ; Helper macros > ;============================================================================= > >@@ -181,8 +167,8 @@ mmx_one: > > paddusw mm0, mm2 ; mm01 = ref1 + ref2 > paddusw mm1, mm3 >- paddusw mm0, [mmx_one] ; mm01 += 1 >- paddusw mm1, [mmx_one] >+ paddusw mm0, [esp] ; mm01 += 1 >+ paddusw mm1, [esp] > psrlw mm0, 1 ; mm01 >>= 1 > psrlw mm1, 1 > >@@ -314,7 +300,7 @@ sad16_mmx: > SAD_16x16_MMX > SAD_16x16_MMX > >- pmaddwd mm6, [mmx_one] ; collapse >+ pmaddwd mm6, [esp] ; collapse > movq mm7, mm6 > psrlq mm7, 32 > paddd mm6, mm7 >@@ -339,6 +325,9 @@ sad8_mmx: > mov edx, [esp+ 8] ; Src2 > mov ecx, [esp+12] ; Stride > >+ push dword 0x00010001 >+ push dword 0x00010001 >+ > pxor mm6, mm6 ; accum > pxor mm7, mm7 ; zero > >@@ -347,13 +336,13 @@ sad8_mmx: > SAD_8x8_MMX > SAD_8x8_MMX > >- pmaddwd mm6, [mmx_one] ; 
collapse >+ pmaddwd mm6, [esp] ; collapse > movq mm7, mm6 > psrlq mm7, 32 > paddd mm6, mm7 > > movd eax, mm6 >- >+ add esp, byte 8 > ret > .endfunc > >@@ -377,6 +366,9 @@ sad16v_mmx: > mov ecx, [esp + 8 + 12] ; Stride > mov ebx, [esp + 8 + 16] ; sad ptr > >+ push dword 0x00010001 >+ push dword 0x00010001 >+ > pxor mm5, mm5 ; accum > pxor mm6, mm6 ; accum > pxor mm7, mm7 ; zero >@@ -390,8 +382,8 @@ sad16v_mmx: > SADV_16x16_MMX > SADV_16x16_MMX > >- pmaddwd mm5, [mmx_one] ; collapse >- pmaddwd mm6, [mmx_one] ; collapse >+ pmaddwd mm5, [esp] ; collapse >+ pmaddwd mm6, [esp] ; collapse > > movq mm2, mm5 > movq mm3, mm6 >@@ -421,8 +413,8 @@ sad16v_mmx: > SADV_16x16_MMX > SADV_16x16_MMX > >- pmaddwd mm5, [mmx_one] ; collapse >- pmaddwd mm6, [mmx_one] ; collapse >+ pmaddwd mm5, [esp] ; collapse >+ pmaddwd mm6, [esp] ; collapse > > movq mm2, mm5 > movq mm3, mm6 >@@ -442,6 +434,7 @@ sad16v_mmx: > > add eax, edi > >+ add esp, byte 8 > pop edi > pop ebx > >@@ -465,6 +458,9 @@ sad16bi_mmx: > mov ebx, [esp+4+12] ; Ref2 > mov ecx, [esp+4+16] ; Stride > >+ push dword 0x00010001 >+ push dword 0x00010001 >+ > pxor mm6, mm6 ; accum2 > pxor mm7, mm7 > .Loop >@@ -502,12 +498,13 @@ sad16bi_mmx: > SADBI_16x16_MMX 0, 0 > SADBI_16x16_MMX 8, 1 > >- pmaddwd mm6, [mmx_one] ; collapse >+ pmaddwd mm6, [esp] ; collapse > movq mm7, mm6 > psrlq mm7, 32 > paddd mm6, mm7 > > movd eax, mm6 >+ add esp, byte 8 > pop ebx > > ret >@@ -530,6 +527,9 @@ sad8bi_mmx: > mov ebx, [esp+4+12] ; Ref2 > mov ecx, [esp+4+16] ; Stride > >+ push dword 0x00010001 >+ push dword 0x00010001 >+ > pxor mm6, mm6 ; accum2 > pxor mm7, mm7 > .Loop >@@ -542,12 +542,13 @@ sad8bi_mmx: > SADBI_16x16_MMX 0, 1 > SADBI_16x16_MMX 0, 1 > >- pmaddwd mm6, [mmx_one] ; collapse >+ pmaddwd mm6, [esp] ; collapse > movq mm7, mm6 > psrlq mm7, 32 > paddd mm6, mm7 > > movd eax, mm6 >+ add esp, byte 8 > pop ebx > ret > .endfunc >@@ -568,6 +569,9 @@ dev16_mmx: > pxor mm5, mm5 ; accum1 > pxor mm6, mm6 ; accum2 > >+ push dword 0x00010001 >+ push dword 0x00010001 >+ > MEAN_16x16_MMX > MEAN_16x16_MMX > MEAN_16x16_MMX >@@ -587,7 +591,7 @@ dev16_mmx: > MEAN_16x16_MMX > > paddusw mm6, mm5 >- pmaddwd mm6, [mmx_one] ; collapse >+ pmaddwd mm6, [esp] ; collapse > movq mm5, mm6 > psrlq mm5, 32 > paddd mm6, mm5 >@@ -622,13 +626,14 @@ dev16_mmx: > ABS_16x16_MMX > ABS_16x16_MMX > >- pmaddwd mm5, [mmx_one] ; collapse >+ pmaddwd mm5, [esp] ; collapse > movq mm6, mm5 > psrlq mm6, 32 > paddd mm6, mm5 > > movd eax, mm6 > >+ add esp, byte 8 > ret > .endfunc > >@@ -747,3 +752,5 @@ sse8_8bit_mmx: > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/motion/x86_asm/sad_sse2.asm xvidcore-1.1.0-beta2/src/motion/x86_asm/sad_sse2.asm >--- xvidcore-1.1.0-beta2-old/src/motion/x86_asm/sad_sse2.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/motion/x86_asm/sad_sse2.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -170,3 +170,5 @@ dev16_sse2: > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/motion/x86_asm/sad_xmm.asm xvidcore-1.1.0-beta2/src/motion/x86_asm/sad_xmm.asm >--- xvidcore-1.1.0-beta2-old/src/motion/x86_asm/sad_xmm.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/motion/x86_asm/sad_xmm.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -444,3 +444,5 @@ sad16v_xmm: > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/quant/x86_asm/quantize_h263_3dne.asm 
xvidcore-1.1.0-beta2/src/quant/x86_asm/quantize_h263_3dne.asm >--- xvidcore-1.1.0-beta2-old/src/quant/x86_asm/quantize_h263_3dne.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/quant/x86_asm/quantize_h263_3dne.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -233,7 +233,8 @@ ALIGN 8 > movq mm3, [ebx] ;B2 > %endif > %if (%1 == 3) >- imul eax, [int_div+4*edi] >+ mov esi, [esp + 4] >+ imul eax, [esi + int_div+4*edi wrt ..gotoff] > %endif > pxor mm5, mm4 ;C7 > pxor mm7, mm6 ;D7 >@@ -313,7 +314,8 @@ ALIGN 8 > %endif > nop > %if (%1 == 3) >- imul eax, [int_div+4*edi] >+ mov esi, [esp +4] >+ imul eax, [esi + int_div+4*edi wrt ..gotoff] > %endif > pxor mm5, mm4 ;C7 > pxor mm7, mm6 ;D7 >@@ -327,21 +329,25 @@ quant_h263_intra_3dne: > mov eax, [esp + 12] ; quant > mov ecx, [esp + 8] ; data > mov edx, [esp + 4] ; coeff >+ push esi >+ push ebx >+ push edi >+ call get_pc.si >+ add esi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ push esi >+ > cmp al, 1 > pxor mm1, mm1 > pxor mm3, mm3 > movq mm0, [ecx] ; mm0 = [1st] > movq mm2, [ecx + 8] >- push esi >- lea esi, [mmx_div + eax*8 - 8] >+ mov ebx, [esi + mmzero wrt ..gotoff] >+ lea esi, [esi + mmx_div + eax*8 - 8 wrt ..gotoff] > >- push ebx >- mov ebx, mmzero >- push edi > jz near .q1loop > > quant_intra 0 >- mov ebp, [esp + 16 + 16] ; dcscalar >+ mov ebp, [esp + 20 + 16] ; dcscalar > ; NB -- there are 3 pushes in the function preambule and one more > ; in "quant_intra 0", thus an added offset of 16 bytes > movsx eax, word [byte ecx] ; DC >@@ -354,20 +360,20 @@ quant_h263_intra_3dne: > quant_intra 2 > sub eax, edi ; DC (+1) > xor ebp, edi ; sign(DC) dcscalar /2 (-1) >- mov edi, [esp + 16 + 16] ; dscalar >+ mov edi, [esp + 20 + 16] ; dscalar > lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2 > mov ebp, [byte esp] > > quant_intra 3 > psubw mm5, mm4 ;C8 >- mov esi, [esp + 12] ; pop back the register value >- mov edi, [esp + 4] ; pop back the register value >+ mov esi, [esp + 16] ; pop back the register value >+ mov edi, [esp + 8] ; pop back the register value > sar eax, 16 > lea ebx, [byte eax + 1] ; workaround for eax < 0 > cmovs eax, ebx ; conditionnaly move the corrected value > mov [edx], ax ; coeff[0] = ax >- mov ebx, [esp + 8] ; pop back the register value >- add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 >+ mov ebx, [esp + 12] ; pop back the register value >+ add esp, byte 20 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 > psubw mm7, mm6 ;D8 > movq [edx + 3 * 32 + 16], mm5 ;C9 > movq [edx + 3 * 32 + 24], mm7 ;D9 >@@ -379,7 +385,7 @@ ALIGN 16 > > .q1loop > quant_intra1 0 >- mov ebp, [esp + 16 + 16] ; dcscalar >+ mov ebp, [esp + 20 + 16] ; dcscalar > movsx eax, word [byte ecx] ; DC > > quant_intra1 1 >@@ -390,20 +396,20 @@ ALIGN 16 > quant_intra1 2 > sub eax, edi ; DC (+1) > xor ebp, edi ; sign(DC) dcscalar /2 (-1) >- mov edi, [esp + 16 + 16] ; dcscalar >+ mov edi, [esp + 20 + 16] ; dcscalar > lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2 > mov ebp, [byte esp] > > quant_intra1 3 > psubw mm5, mm4 ;C8 >- mov esi, [dword esp + 12] ; pop back the register value >- mov edi, [esp + 4] ; pop back the register value >+ mov esi, [dword esp + 16] ; pop back the register value >+ mov edi, [esp + 8] ; pop back the register value > sar eax, 16 > lea ebx, [byte eax + 1] ; workaround for eax < 0 > cmovs eax, ebx ; conditionnaly move the corrected value > mov [edx], ax ; coeff[0] = ax >- mov ebx, [esp + 8] ; pop back the 
register value >- add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 >+ mov ebx, [esp + 12] ; pop back the register value >+ add esp, byte 20 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 > psubw mm7, mm6 ;D8 > movq [edx + 3 * 32 + 16], mm5 ;C9 > movq [edx + 3 * 32 + 24], mm7 ;D9 >@@ -505,13 +511,18 @@ quant_h263_inter_3dne: > mov eax, [esp + 12] ; quant > push ebx > >+ call get_pc.bx >+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ push ebx >+ > pxor mm5, mm5 ; sum > nop >- lea ebx,[mmx_sub + eax * 8 - 8] ; sub >- movq mm7, [mmx_div + eax * 8 - 8] ; divider >+ movq mm7, [ebx + mmx_div + eax * 8 - 8 wrt ..gotoff] ; divider >+ lea ebx,[ebx + mmx_sub + eax * 8 - 8 wrt ..gotoff] ; sub > > cmp al, 1 >- lea eax, [mmzero] >+ mov eax, [esp] >+ lea eax, [eax + mmzero wrt ..gotoff] > jz near .q1loop > cmp esp, esp > ALIGN 8 >@@ -535,14 +546,15 @@ ALIGN 8 > pxor mm4, mm3 ;B9 > psubw mm4, mm3 ;B10 > movq [edx + 4*24+16], mm2 ;C11 >- pop ebx > movq [edx + 4*24+8], mm4 ;B11 >- pmaddwd mm5, [plus_one] >+ pop ebx >+ pmaddwd mm5, [ebx + plus_one wrt ..gotoff] > movq mm0, mm5 > punpckhdq mm5, mm5 > paddd mm0, mm5 > movd eax, mm0 ; return sum > >+ pop ebx > ret > > ALIGN 16 >@@ -558,7 +570,8 @@ ALIGN 16 > quantinter1 6 > quantinter1 7 > >- pmaddwd mm5, [plus_one] >+ pop ebx >+ pmaddwd mm5, [ebx + plus_one wrt ..gotoff] > movq mm0, mm5 > psrlq mm5, 32 > paddd mm0, mm5 >@@ -658,23 +671,29 @@ dequant_h263_intra_3dne: > pxor mm2, mm2 > push edi > push ebx >- lea edi, [mmx_mul + eax*8 - 8] ; 2*quant > push ebp >- mov ebx, mmx_2047 >- movsx ebp, word [ecx] >- lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 > push esi >- mov esi, mmzero >+ >+ call get_pc.di >+ add edi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ push edi >+ >+ lea edi, [edi + mmx_mul + eax*8 - 8 wrt ..gotoff] ; 2*quant >+ mov esi, [esp] >+ lea ebx, [esi + mmx_2047 wrt ..gotoff] >+ movsx ebp, word [ecx] >+ lea eax, [esi + mmx_add + eax*8 - 8 wrt ..gotoff] ; quant or quant-1 >+ lea esi, [esi + mmzero wrt ..gotoff] > pxor mm7, mm7 > movq mm3, [ecx+120] ;B2 ; c = coeff[i] > pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) > >- imul ebp, [esp+16+16] ; dcscalar >+ imul ebp, [esp+16+20] ; dcscalar > psubw mm2, mm3 ;-c ;B3 (1st dep) > pmaxsw mm2, mm3 ;|c| ;B4 (2nd) > pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) > psraw mm3, 15 ; sign(c) ;B7 (2nd) >- mov edx, [esp+ 4+16] ; data >+ mov edx, [esp+ 4+20] ; data > > ALIGN 8 > dequant 0 >@@ -684,7 +703,8 @@ ALIGN 8 > > dequant 1 > >- cmovl ebp, [int_2048] >+ mov ebp, [esp] >+ cmovl ebp, [ebp + int_2048 wrt ..gotoff] > nop > > dequant 2 >@@ -694,7 +714,8 @@ ALIGN 8 > > dequant 3 > >- cmovg ebp, [int2047] >+ mov ebp, [esp] >+ cmovg ebp, [ebp + int2047 wrt ..gotoff] > nop > > dequant 4 >@@ -703,16 +724,16 @@ ALIGN 8 > pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) > pandn mm7, [eax] ;B9 offset = isZero ? 
0 : quant_add (2nd) > mov eax, ebp >- mov esi, [esp] >- mov ebp, [esp+4] >+ mov esi, [esp+4] >+ mov ebp, [esp+8] > pxor mm5, mm4 ;C13 (6th+) > paddw mm7, mm3 ;B10 offset +negate back (3rd) > movq [edx+4*24+16], mm5 ;C14 (7th) > paddw mm2, mm7 ;B11 mm7 free (4th+) > pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) >- mov ebx, [esp+8] >- mov edi, [esp+12] >- add esp, byte 16 >+ mov ebx, [esp+12] >+ mov edi, [esp+16] >+ add esp, byte 20 > pxor mm3, mm2 ;B13 (6th+) > movq [edx+4*24+8], mm3 ;B14 (7th) > mov [edx], ax >@@ -721,6 +742,20 @@ ALIGN 8 > ret > .endfunc > >+extern _GLOBAL_OFFSET_TABLE_ >+get_pc >+.bx: >+ mov ebx, [esp] >+ retn >+ >+.si: >+ mov esi, [esp] >+ retn >+ >+.di: >+ mov edi, [esp] >+ retn >+ > ;----------------------------------------------------------------------------- > ; > ; uint32_t dequant_h263_inter_3dne(int16_t * data, >@@ -744,18 +779,24 @@ dequant_h263_inter_3dne: > push edi > push ebx > push esi >- lea edi, [mmx_mul + eax*8 - 8] ; 2*quant >- mov ebx, mmx_2047 >+ >+ call get_pc.di >+ add edi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ push edi >+ >+ mov ebx, [edi + mmx_2047 wrt ..gotoff] >+ lea edi, [edi + mmx_mul + eax*8 - 8 wrt ..gotoff] ; 2*quant > pxor mm7, mm7 > movq mm3, [ecx+120] ;B2 ; c = coeff[i] > pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st) >- lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1 >+ mov esi, [esp] >+ lea eax, [esi + mmx_add + eax*8 - 8 wrt ..gotoff] ; quant or quant-1 > psubw mm2, mm3 ;-c ;B3 (1st dep) >- mov esi, mmzero >+ lea esi, [esi + mmzero wrt ..gotoff] > pmaxsw mm2, mm3 ;|c| ;B4 (2nd) > pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+) > psraw mm3, 15 ; sign(c) ;B7 (2nd) >- mov edx, [dword esp+ 4+12] ; data >+ mov edx, [dword esp+ 4+16] ; data > > ALIGN 8 > >@@ -768,15 +809,15 @@ ALIGN 8 > paddw mm4, mm6 ;C11 mm6 free (4th+) > pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+) > pandn mm7, [eax] ;B9 offset = isZero ? 
0 : quant_add (2nd) >- mov esi, [esp] >+ mov esi, [esp+4] > pxor mm5, mm4 ;C13 (6th+) > paddw mm7, mm3 ;B10 offset +negate back (3rd) > movq [edx+4*24+16], mm5 ;C14 (7th) > paddw mm2, mm7 ;B11 mm7 free (4th+) > pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+) >- mov ebx, [esp+4] >- mov edi, [esp+8] >- add esp, byte 12 >+ mov ebx, [esp+8] >+ mov edi, [esp+12] >+ add esp, byte 16 > pxor mm3, mm2 ;B13 (6th+) > movq [edx+4*24+8], mm3 ;B14 (7th) > >@@ -784,3 +825,5 @@ ALIGN 8 > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/quant/x86_asm/quantize_h263_mmx.asm xvidcore-1.1.0-beta2/src/quant/x86_asm/quantize_h263_mmx.asm >--- xvidcore-1.1.0-beta2-old/src/quant/x86_asm/quantize_h263_mmx.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/quant/x86_asm/quantize_h263_mmx.asm 2005-10-23 18:42:40.000000000 +0200 >@@ -139,9 +139,10 @@ ALIGN 16 > quant_h263_intra_mmx: > > push esi >+ push edi > >- mov esi, [esp + 4 + 8] ; data >- mov ecx,[esp + 4 + 16] ; dcscalar >+ mov esi, [esp + 8 + 8] ; data >+ mov ecx,[esp + 8 + 16] ; dcscalar > movsx eax, word [esi] ; data[0] > > sar ecx,1 ; dcscalar /2 >@@ -151,14 +152,17 @@ quant_h263_intra_mmx: > sub eax,edx > add eax,ecx ; + (dcscalar/2)*sgn(data[0]) > >- mov ecx, [esp + 4 + 12] ; quant >+ mov ecx, [esp + 8 + 12] ; quant > cdq >- idiv dword [esp + 4 + 16] ; dcscalar >+ idiv dword [esp + 8 + 16] ; dcscalar > cmp ecx, 1 >- mov edx, [esp + 4 + 4] ; coeff >+ mov edx, [esp + 8 + 4] ; coeff > je .low >- >- movq mm7, [mmx_div+ecx * 8 - 8] >+ >+ call get_pc.di >+ add edi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ movq mm7, [edi + mmx_div+ecx * 8 - 8 wrt ..gotoff] > mov ecx,4 > > .loop >@@ -228,10 +232,11 @@ quant_h263_intra_mmx: > jne .loop_low > > .end >- mov edx, [esp + 4 + 4] ; coeff >+ mov edx, [esp + 8 + 4] ; coeff > mov [edx],ax > xor eax,eax ; return 0 > >+ pop edi > pop esi > ret > .endfunc >@@ -251,23 +256,28 @@ ALIGN 16 > quant_h263_intra_sse2: > > push esi >+ push edi > >- mov esi, [esp + 4 + 8] ; data >+ mov esi, [esp + 8 + 8] ; data > > movsx eax, word [esi] ; data[0] > >- mov ecx,[esp + 4 + 16] ; dcscalar >+ mov ecx,[esp + 8 + 16] ; dcscalar > mov edx,eax > sar ecx,1 > add eax,ecx > sub edx,ecx > cmovl eax,edx ; +/- dcscalar/2 >- mov ecx, [esp + 4 + 12] ; quant >+ mov ecx, [esp + 8 + 12] ; quant > cdq >- idiv dword [esp + 4 + 16] ; dcscalar >+ idiv dword [esp + 8 + 16] ; dcscalar > cmp ecx, 1 >- mov edx, [esp + 4 + 4] ; coeff >- movq xmm7, [mmx_div+ecx * 8 - 8] >+ mov edx, [esp + 8 + 4] ; coeff >+ >+ call get_pc.di >+ add edi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ movq xmm7, [edi + mmx_div+ecx * 8 - 8 wrt ..gotoff] > je .low > > mov ecx,2 >@@ -340,10 +350,11 @@ quant_h263_intra_sse2: > jne .loop_low > > .end >- mov edx, [esp + 4 + 4] ; coeff >+ mov edx, [esp + 8 + 4] ; coeff > mov [edx],ax > xor eax,eax ; return 0 > >+ pop edi > pop esi > ret > .endfunc >@@ -370,13 +381,16 @@ quant_h263_inter_mmx: > > xor ecx, ecx > >+ call get_pc.dx >+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ > pxor mm5, mm5 ; sum >- movq mm6, [mmx_sub + eax * 8 - 8] ; sub >+ movq mm6, [edx + mmx_sub + eax * 8 - 8 wrt ..gotoff] ; sub > > cmp al, 1 > jz .q1loop > >- movq mm7, [mmx_div + eax * 8 - 8] ; divider >+ movq mm7, [edx + mmx_div + eax * 8 - 8 wrt ..gotoff] ; divider > > ALIGN 8 > .loop >@@ -408,7 +422,7 @@ ALIGN 8 > jnz .loop > > .done >- pmaddwd mm5, [plus_one] >+ pmaddwd mm5, [edx + plus_one wrt ..gotoff] > movq mm0, mm5 > psrlq mm5, 32 > paddd mm0, mm5 >@@ 
-477,7 +491,10 @@ quant_h263_inter_sse2: > > pxor xmm5, xmm5 ; sum > >- movq mm0, [mmx_sub + eax*8 - 8] ; sub >+ call get_pc.dx >+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ movq mm0, [edx + mmx_sub + eax*8 - 8 wrt ..gotoff] ; sub > movq2dq xmm6, mm0 ; load into low 8 bytes > movlhps xmm6, xmm6 ; duplicate into high 8 bytes > >@@ -485,7 +502,7 @@ quant_h263_inter_sse2: > jz near .qes2_q1loop > > .qes2_not1 >- movq mm0, [mmx_div + eax*8 - 8] ; divider >+ movq mm0, [edx + mmx_div + eax*8 - 8 wrt ..gotoff] ; divider > movq2dq xmm7, mm0 > movlhps xmm7, xmm7 > >@@ -519,7 +536,7 @@ ALIGN 16 > jnz .qes2_loop > > .qes2_done >- movdqu xmm6, [plus_one] >+ movdqu xmm6, [edx + plus_one wrt ..gotoff] > pmaddwd xmm5, xmm6 > movhlps xmm6, xmm5 > paddd xmm5, xmm6 >@@ -583,8 +600,12 @@ dequant_h263_intra_mmx: > > mov ecx, [esp+12] ; quant > mov eax, [esp+ 8] ; coeff >+ >+ call get_pc.dx >+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ > pcmpeqw mm0,mm0 >- movq mm6, [mmx_quant + ecx*8] ; quant >+ movq mm6, [edx + mmx_quant + ecx*8 wrt ..gotoff] ; quant > shl ecx,31 ; quant & 1 ? 0 : - 1 > movq mm7,mm6 > movq mm5,mm0 >@@ -841,8 +862,12 @@ dequant_h263_inter_mmx: > > mov ecx, [esp+12] ; quant > mov eax, [esp+ 8] ; coeff >+ >+ call get_pc.dx >+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ > pcmpeqw mm0,mm0 >- movq mm6, [mmx_quant + ecx*8] ; quant >+ movq mm6, [edx + mmx_quant + ecx*8 wrt ..gotoff] ; quant > shl ecx,31 ; odd/even > movq mm7,mm6 > movd mm1,ecx >@@ -912,8 +937,12 @@ dequant_h263_inter_xmm: > > mov ecx, [esp+12] ; quant > mov eax, [esp+ 8] ; coeff >+ >+ call get_pc.dx >+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ > pcmpeqw mm0,mm0 >- movq mm6, [mmx_quant + ecx*8] ; quant >+ movq mm6, [edx + mmx_quant + ecx*8 wrt ..gotoff] ; quant > shl ecx,31 > movq mm5,mm0 > movd mm1,ecx >@@ -967,7 +996,16 @@ dequant_h263_inter_xmm: > ret > .endfunc > >- >+extern _GLOBAL_OFFSET_TABLE_ >+get_pc >+.di: >+ mov edi, [esp] >+ retn >+ >+.dx: >+ mov edx, [esp] >+ retn >+ > ;----------------------------------------------------------------------------- > ; > ; uint32_t dequant_h263_inter_sse2(int16_t * data, >@@ -983,7 +1021,10 @@ dequant_h263_inter_sse2: > mov ecx, [esp+12] ; quant > mov eax, [esp+ 8] ; coeff > >- movq xmm6, [mmx_quant + ecx*8] ; quant >+ call get_pc.dx >+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ movq xmm6, [edx + mmx_quant + ecx*8 wrt ..gotoff] ; quant > inc ecx > pcmpeqw xmm5,xmm5 > and ecx,1 >@@ -1039,3 +1080,5 @@ dequant_h263_inter_sse2: > ret > .endfunc > >+section .note.GNU-stack noalloc noexec nowrite progbits >+ >diff -urp xvidcore-1.1.0-beta2-old/src/quant/x86_asm/quantize_mpeg_mmx.asm xvidcore-1.1.0-beta2/src/quant/x86_asm/quantize_mpeg_mmx.asm >--- xvidcore-1.1.0-beta2-old/src/quant/x86_asm/quantize_mpeg_mmx.asm 2005-04-03 22:39:44.000000000 +0200 >+++ xvidcore-1.1.0-beta2/src/quant/x86_asm/quantize_mpeg_mmx.asm 2005-10-23 18:42:41.000000000 +0200 >@@ -162,7 +162,11 @@ quant_mpeg_intra_mmx: > mov eax, [esp + 16 + 12] ; quant > mov ebx, [esp + 16 + 20] ; mpeg_quant_matrices > >- movq mm5, [quantd + eax * 8 - 8] ; quantd -> mm5 >+ push ebp >+ call get_pc.bp >+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc >+ >+ movq mm5, [ebp + quantd + eax * 8 - 8 wrt ..gotoff] ; quantd -> mm5 > > xor ecx, ecx > cmp al, 1 >@@ -171,7 +175,7 @@ quant_mpeg_intra_mmx: > cmp al, 2 > jz near .q2loop > >- movq mm7, [mmx_div + eax * 8 - 8] ; multipliers[quant] -> mm7 >+ movq mm7, [ebp + mmx_div + eax * 8 - 8 wrt ..gotoff] ; multipliers[quant] -> mm7 > 
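
[Editor's note, not part of the patch] The hunks above all follow one pattern, which is the heart of this TEXTREL fix: an absolute data reference such as movq mm7, [mmx_div + eax*8 - 8] forces an R_386_32 relocation inside .text, so the loader must write to (and thereby unshare) the code pages. The replacement computes the GOT address once per function via a tiny thunk and reaches the same data GOT-relative. A minimal sketch of the idiom, with hypothetical names (my_func, my_table) standing in for the patch's real symbols:

    BITS 32
    extern _GLOBAL_OFFSET_TABLE_

    SECTION .data
    my_table: dw 1, 1, 1, 1          ; hypothetical 8-byte constant

    SECTION .text
    get_pc.bx:
      mov ebx, [esp]                 ; return address = address of the insn after the CALL
      retn

    global my_func
    my_func:
      push ebx
      call get_pc.bx                                      ; ebx = address of the ADD below
      add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc ; ebx = GOT base, formed PC-relatively
      movq mm0, [ebx + my_table wrt ..gotoff]             ; data access with no text relocation
      emms                                                ; leave MMX state clean
      pop ebx
      ret

Because the GOT pointer ties up a register for the whole function, the patch also spills whatever that register used to carry; in the qpel_mmx.asm hunks, for instance, the Size counter moves from ecx to its stack slot and dec ecx becomes dec dword [esp+20 + 2*4].
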
> ALIGN 16
> .loop
>@@ -234,6 +238,7 @@ ALIGN 16
>
> mov [edi], ax ; coeff[0] = ax
>
>+ pop ebp
> pop ebx
> pop edi
> pop esi
>@@ -346,6 +351,10 @@ quant_mpeg_inter_mmx:
> mov eax, [esp + 16 + 12] ; quant
> mov ebx, [esp + 16 + 16] ; mpeg_quant_matrices
>
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
> xor ecx, ecx
>
> pxor mm5, mm5 ; sum
>@@ -356,7 +365,7 @@ quant_mpeg_inter_mmx:
> cmp al, 2
> jz near .q2loop
>
>- movq mm7, [mmx_div + eax * 8 - 8] ; divider
>+ movq mm7, [ebp + mmx_div + eax * 8 - 8 wrt ..gotoff] ; divider
>
> ALIGN 16
> .loop
>@@ -400,17 +409,17 @@ ALIGN 16
> jnz near .loop
>
> .done
>- pmaddwd mm5, [mmx_one]
>+ pmaddwd mm5, [ebp + mmx_one wrt ..gotoff]
> movq mm0, mm5
> psrlq mm5, 32
> paddd mm0, mm5
> movd eax, mm0 ; return sum
>
>+ pop ebp
> pop ebx
> pop edi
> pop esi
> pop ecx
>-
> ret
>
> ALIGN 16
>@@ -556,7 +565,11 @@ dequant_mpeg_intra_mmx:
> mov eax, [esp + 4 + 12] ; quant
> mov ebx, [esp + 4 + 20] ; mpeg_quant_matrices
>
>- movq mm7, [mmx_mul_quant + eax*8 - 8]
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ movq mm7, [ebp + mmx_mul_quant + eax*8 - 8 wrt ..gotoff]
> mov eax, -16 ; to keep ALIGNed, we regularly process coeff[0]
> psllw mm7, 2 ; << 2. See comment.
> pxor mm6, mm6 ; this is a NOP
>@@ -595,11 +608,11 @@ ALIGN 16
> pmullw mm6, mm3 ; low of coeff*(matrix*quant)
> pmulhw mm3, mm5 ; high of coeff*(matrix*quant)
>
>- pcmpgtw mm0, [zero]
>+ pcmpgtw mm0, [ebp + zero wrt ..gotoff]
> paddusw mm2, mm0
> psrlw mm2, 5
>
>- pcmpgtw mm3, [zero]
>+ pcmpgtw mm3, [ebp + zero wrt ..gotoff]
> paddusw mm6, mm3
> psrlw mm6, 5
>
>@@ -620,22 +633,28 @@ ALIGN 16
> ; deal with DC
> movd mm0, [ecx]
> pmullw mm0, [esp + 4 + 16] ; dcscalar
>- movq mm2, [mmx_32767_minus_2047]
>+ movq mm2, [ebp + mmx_32767_minus_2047 wrt ..gotoff]
> paddsw mm0, mm2
> psubsw mm0, mm2
>- movq mm2, [mmx_32768_minus_2048]
>+ movq mm2, [ebp + mmx_32768_minus_2048 wrt ..gotoff]
> psubsw mm0, mm2
> paddsw mm0, mm2
> movd eax, mm0
> mov [edx], ax
>
> xor eax, eax
>-
>+
>+ pop ebp
> pop ebx
>
> ret
> .endfunc
>
>+extern _GLOBAL_OFFSET_TABLE_
>+get_pc.bp:
>+ mov ebp, [esp]
>+ retn
>+
> ;-----------------------------------------------------------------------------
> ;
> ; uint32_t dequant_mpeg_inter_mmx(int16_t * data,
>@@ -660,7 +679,11 @@ dequant_mpeg_inter_mmx:
> mov eax, [esp + 4 + 12] ; quant
> mov ebx, [esp + 4 + 16] ; mpeg_quant_matrices
>
>- movq mm7, [mmx_mul_quant + eax*8 - 8]
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ movq mm7, [ebp + mmx_mul_quant + eax*8 - 8 wrt ..gotoff]
> mov eax, -16
> paddw mm7, mm7 ; << 1
> pxor mm6, mm6 ; mismatch sum
>@@ -702,7 +725,7 @@ ALIGN 16
> movq mm4, mm7 ; (matrix*quant)
> pmullw mm4, [ebx + 512 + 8*eax + 8*16 -2*8 + 8]
>
>- pcmpgtw mm5, [zero]
>+ pcmpgtw mm5, [ebp + zero wrt ..gotoff]
> paddusw mm0, mm5
> psrlw mm0, 5
> pxor mm0, mm1 ; start restoring sign
>@@ -713,7 +736,7 @@ ALIGN 16
> pmullw mm2, mm4 ; low of c*(matrix*quant)
> psubw mm0, mm1 ; finish restoring sign
>
>- pcmpgtw mm5, [zero]
>+ pcmpgtw mm5, [ebp + zero wrt ..gotoff]
> paddusw mm2, mm5
> psrlw mm2, 5
> pxor mm2, mm3 ; start restoring sign
>@@ -744,9 +767,12 @@ ALIGN 16
> xor word [edx + 2*63], ax
>
> xor eax, eax
>-
>+
>+ pop ebp
> pop ebx
>
> ret
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
>diff -urp xvidcore-1.1.0-beta2-old/src/quant/x86_asm/quantize_mpeg_xmm.asm xvidcore-1.1.0-beta2/src/quant/x86_asm/quantize_mpeg_xmm.asm
>--- xvidcore-1.1.0-beta2-old/src/quant/x86_asm/quantize_mpeg_xmm.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/quant/x86_asm/quantize_mpeg_xmm.asm 2005-10-23 18:42:41.000000000 +0200
>@@ -188,8 +188,12 @@ quant_mpeg_intra_xmm:
> push esi
> push edi
> push ebx
>- nop
>- mov edi, [esp + 12 + 20] ; mpeg_quant_matrices
>+
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ mov edi, [esp + 16 + 20] ; mpeg_quant_matrices
> mov esi, -14
> pxor mm0, mm0
> pxor mm3, mm3
>@@ -226,8 +230,8 @@ ALIGN 16
> psubw mm0, mm2
> psubw mm3, mm6
> nop4
>- movq mm2, [quantd + ecx * 8 - 8]
>- movq mm6, [mmx_divs + ecx * 8 - 8]
>+ movq mm2, [ebp + quantd + ecx * 8 - 8 wrt ..gotoff]
>+ movq mm6, [ebp + mmx_divs + ecx * 8 - 8 wrt ..gotoff]
> paddw mm5, mm2
> paddw mm7, mm2
> mov esp, esp
>@@ -250,27 +254,28 @@ ALIGN 16
>
> .done
> ; calculate data[0] // (int32_t)dcscalar)
>- mov esi, [esp + 12 + 16] ; dcscalar
>+ mov esi, [esp + 16 + 16] ; dcscalar
> movsx ecx, word [eax]
> mov edi, ecx
>- mov edx, [esp + 12 + 16]
>+ mov edx, [esp + 16 + 16]
> shr edx, 1 ; ebx = dcscalar /2
> sar edi, 31 ; cdq is vectorpath
> xor edx, edi ; ebx = eax V -eax -1
> sub ecx, edi
> add ecx, edx
>- mov edx, [dword esp + 12 + 4]
>- mov esi, [int_div+4*esi]
>+ mov edx, [dword esp + 16 + 4]
>+ mov esi, [ebp + int_div+4*esi wrt ..gotoff]
> imul ecx, esi
> sar ecx, 17
> lea ebx, [byte ecx + 1]
> cmovs ecx, ebx
> ; idiv cx ; ecx = edi:ecx / dcscalar
>
>- mov ebx, [esp]
>- mov edi, [esp+4]
>- mov esi, [esp+8]
>- add esp, byte 12
>+ mov ebp, [esp]
>+ mov ebx, [esp+4]
>+ mov edi, [esp+8]
>+ mov esi, [esp+12]
>+ add esp, byte 16
> mov [edx], cx ; coeff[0] = ax
>
> xor eax, eax
>@@ -303,7 +308,7 @@ ALIGN 16
> psubw mm0, mm2 ;mismatch
> psubw mm3, mm6
> nop4
>- movq mm2, [quantd + ecx * 8 - 8]
>+ movq mm2, [ebp + quantd + ecx * 8 - 8 wrt ..gotoff]
> paddw mm5, mm2 ;first approx with quantd
> paddw mm7, mm2
> mov esp, esp
>@@ -353,8 +358,8 @@ ALIGN 8
> psubw mm0, mm2 ;mismatch
> psubw mm3, mm6
> nop4
>- movq mm2, [quantd + ecx * 8 - 8]
>- movq mm6, [mmx_div + ecx * 8 - 8] ; divs for q<=16
>+ movq mm2, [ebp + quantd + ecx * 8 - 8 wrt ..gotoff]
>+ movq mm6, [ebp + mmx_div + ecx * 8 - 8 wrt ..gotoff] ; divs for q<=16
> paddw mm5, mm2 ;first approx with quantd
> paddw mm7, mm2
> mov esp, esp
>@@ -397,8 +402,12 @@ quant_mpeg_inter_xmm:
> push esi
> push edi
> push ebx
>- nop
>- mov edi, [esp + 12 + 16]
>+
>+ push ebp
>+ call get_pc.bp
>+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ mov edi, [esp + 16 + 16]
> mov esi, -14
> mov ebx, esp
> sub esp, byte 24
>@@ -440,8 +449,8 @@ ALIGN 16
> pmullw mm6, mm7
> psubw mm0, mm2
> psubw mm3, mm6
>- movq mm2, [byte ebx]
>- movq mm6, [mmx_divs + ecx * 8 - 8]
>+ movq mm2, [ebp + ebx wrt ..gotoff]
>+ movq mm6, [ebp + mmx_divs + ecx * 8 - 8 wrt ..gotoff]
> pmulhuw mm0, [edi + 768 + 8*esi+112]
> pmulhuw mm3, [edi + 768 + 8*esi+120]
> paddw mm2, [ebx+8] ;sum
>@@ -466,11 +475,12 @@ ALIGN 16
> .done
> ; calculate data[0] // (int32_t)dcscalar)
> paddw mm2, [ebx]
>- mov ebx, [esp+24]
>- mov edi, [esp+4+24]
>- mov esi, [esp+8+24]
>- add esp, byte 12+24
>- pmaddwd mm2, [mmx_one]
>+ mov ebx, [esp+4+24]
>+ mov edi, [esp+8+24]
>+ mov esi, [esp+12+24]
>+ pmaddwd mm2, [ebp + mmx_one wrt ..gotoff]
>+ mov ebp, [esp+24]
>+ add esp, byte 16+24
> punpckldq mm0, mm2 ;get low dw to mm0:high
> paddd mm0,mm2
> punpckhdq mm0, mm0 ;get result to low
>@@ -554,7 +564,7 @@ ALIGN 8
> psubw mm0,mm2 ;mismatch
> psubw mm3,mm6
> movq mm2,[byte ebx]
>- movq mm6,[mmx_div + ecx * 8 - 8] ; divs for q<=16
>+ movq mm6,[ebp + mmx_div + ecx * 8 - 8 wrt ..gotoff] ; divs for q<=16
> pmulhuw mm0,[edi + 768 + 8*esi+112] ;correction
> pmulhuw mm3,[edi + 768 + 8*esi+120]
> paddw mm2,[ebx+8] ;sum
>@@ -644,7 +654,11 @@ ALIGN 16
> dequant_mpeg_intra_3dne:
> mov eax, [esp+12] ; quant
> mov ecx, [esp+8] ; coeff
>- movq mm7, [mmx_mul_quant + eax*8 - 8]
>+
>+ call get_pc.dx
>+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ movq mm7, [edx + mmx_mul_quant + eax*8 - 8 wrt ..gotoff]
> psllw mm7, 2 ; << 2. See comment.
> mov edx, [esp+4] ; data
> push ebx
>@@ -700,6 +714,16 @@ ALIGN 4
> ret
> .endfunc
>
>+extern _GLOBAL_OFFSET_TABLE_
>+get_pc
>+.dx:
>+ mov edx, [esp]
>+ retn
>+
>+.bp:
>+ mov ebp, [esp]
>+ retn
>+
> ;-----------------------------------------------------------------------------
> ;
> ; uint32_t dequant_mpeg_inter_3dne(int16_t * data,
>@@ -716,16 +740,20 @@ ALIGN 4
>
> ALIGN 16
> dequant_mpeg_inter_3dne:
>- mov edx, [esp+ 4] ; data
> mov ecx, [esp+ 8] ; coeff
> mov eax, [esp+12] ; quant
>- movq mm7, [mmx_mul_quant + eax*8 - 8]
>+
>+ call get_pc.dx
>+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ movq mm7, [edx + mmx_mul_quant + eax*8 - 8 wrt ..gotoff]
> mov eax, -14
> paddw mm7, mm7 ; << 1
> pxor mm6, mm6 ; mismatch sum
> push esi
> push edi
>- mov esi, mmzero
>+ mov esi, [edx + mmzero wrt ..gotoff]
>+ mov edx, [esp + 8 + 4] ; data
> pxor mm1, mm1
> pxor mm3, mm3
> mov edi, [esp + 8 + 16] ; mpeg_quant_matrices
>@@ -815,3 +843,5 @@ ALIGN 16
> ret
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
>diff -urp xvidcore-1.1.0-beta2-old/src/utils/x86_asm/cpuid.asm xvidcore-1.1.0-beta2/src/utils/x86_asm/cpuid.asm
>--- xvidcore-1.1.0-beta2-old/src/utils/x86_asm/cpuid.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/utils/x86_asm/cpuid.asm 2005-10-23 18:42:41.000000000 +0200
>@@ -66,20 +66,6 @@ BITS 32
> %define XVID_CPU_TSC (1<< 6)
>
> ;=============================================================================
>-; Read only data
>-;=============================================================================
>-
>-ALIGN 32
>-%ifdef FORMAT_COFF
>-SECTION .rodata
>-%else
>-SECTION .rodata align=16
>-%endif
>-
>-vendorAMD:
>- db "AuthenticAMD"
>-
>-;=============================================================================
> ; Macros
> ;=============================================================================
>
>@@ -161,11 +147,11 @@ check_cpu_features:
> cpuid
>
> ; AMD cpu ?
>- lea esi, [vendorAMD]
>- lea edi, [esp]
>- mov ecx, 12
>- cld
>- repe cmpsb
>+ cmp dword [esp],"Auth"
>+ jnz .cpu_quit
>+ cmp dword [esp+4],"enti"
>+ jnz .cpu_quit
>+ cmp dword [esp+8],"cAMD"
> jnz .cpu_quit
>
> ; 3DNow! support ?
>@@ -225,4 +211,5 @@ emms_3dn:
> ret
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>
>diff -urp xvidcore-1.1.0-beta2-old/src/utils/x86_asm/interlacing_mmx.asm xvidcore-1.1.0-beta2/src/utils/x86_asm/interlacing_mmx.asm
>--- xvidcore-1.1.0-beta2-old/src/utils/x86_asm/interlacing_mmx.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/utils/x86_asm/interlacing_mmx.asm 2005-10-23 18:42:41.000000000 +0200
>@@ -129,6 +129,11 @@ cglobal MBFieldTest_mmx
> paddw mm7, mm3
> %endmacro
>
>+extern _GLOBAL_OFFSET_TABLE_
>+get_pc.bx:
>+ mov ebx, [esp]
>+ retn
>+
> ;-----------------------------------------------------------------------------
> ;
> ; uint32_t MBFieldTest_mmx(int16_t * const data);
>@@ -141,7 +146,11 @@ MBFieldTest_mmx:
> push esi
> push edi
>
>- mov esi, [esp+8+4] ; esi = top left block
>+ push ebx
>+ call get_pc.bx
>+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
>+
>+ mov esi, [esp+12+4] ; esi = top left block
> mov edi, esi
> add edi, 256 ; edi = bottom left block
>
>@@ -184,7 +193,7 @@ MBFieldTest_mmx:
> psubw m14, mm4
> paddw mm6, m14 ; add to frame total
>
>- mov ecx, [nexts+eax*4] ; move esi/edi 8 pixels to the right
>+ mov ecx, [ebx+nexts+eax*4 wrt ..gotoff] ; move esi/edi 8 pixels to the right
> add esi, ecx
> add edi, ecx
>
>@@ -192,7 +201,7 @@ MBFieldTest_mmx:
> jnz near .loop
>
> .decide:
>- movq mm0, [ones] ; add packed words into single dwords
>+ movq mm0, [ebx + ones wrt ..gotoff] ; add packed words into single dwords
> pmaddwd mm6, mm0
> pmaddwd mm7, mm0
>
>@@ -211,9 +220,12 @@ MBFieldTest_mmx:
> inc eax ; if frame>=field, use field dct (return 1)
>
> .end:
>+ pop ebx
> pop edi
> pop esi
>
> ret
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
>diff -urp xvidcore-1.1.0-beta2-old/src/utils/x86_asm/mem_transfer_3dne.asm xvidcore-1.1.0-beta2/src/utils/x86_asm/mem_transfer_3dne.asm
>--- xvidcore-1.1.0-beta2-old/src/utils/x86_asm/mem_transfer_3dne.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/utils/x86_asm/mem_transfer_3dne.asm 2005-10-23 18:42:41.000000000 +0200
>@@ -437,3 +437,5 @@ transfer8x8_copy_3dne:
> ret
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
>diff -urp xvidcore-1.1.0-beta2-old/src/utils/x86_asm/mem_transfer_mmx.asm xvidcore-1.1.0-beta2/src/utils/x86_asm/mem_transfer_mmx.asm
>--- xvidcore-1.1.0-beta2-old/src/utils/x86_asm/mem_transfer_mmx.asm 2005-04-03 22:39:44.000000000 +0200
>+++ xvidcore-1.1.0-beta2/src/utils/x86_asm/mem_transfer_mmx.asm 2005-10-23 18:42:41.000000000 +0200
>@@ -46,20 +46,6 @@ BITS 32
> %endmacro
>
> ;=============================================================================
>-; Read only data
>-;=============================================================================
>-
>-%ifdef FORMAT_COFF
>-SECTION .rodata
>-%else
>-SECTION .rodata align=16
>-%endif
>-
>-ALIGN 16
>-mmx_one:
>- dw 1, 1, 1, 1
>-
>-;=============================================================================
> ; Code
> ;=============================================================================
>
>@@ -259,8 +245,8 @@ transfer_8to16subro_mmx:
> punpckhbw mm3, mm7
> paddusw mm4, mm1
> paddusw mm6, mm3
>- paddusw mm4, [mmx_one]
>- paddusw mm6, [mmx_one]
>+ paddusw mm4, [esp]
>+ paddusw mm6, [esp]
> psrlw mm4, 1
> psrlw mm6, 1
> packuswb mm4, mm6
>@@ -277,8 +263,8 @@ transfer_8to16subro_mmx:
> punpckhbw mm3, mm7
> paddusw mm5, mm1
> paddusw mm6, mm3
>- paddusw mm5, [mmx_one]
>- paddusw mm6, [mmx_one]
>+ paddusw mm5, [esp]
>+ paddusw mm6, [esp]
> lea esi, [esi+2*edx]
> psrlw mm5, 1
> psrlw mm6, 1
>@@ -322,10 +308,13 @@ transfer_8to16sub2_mmx:
> mov edx, [esp+8+20] ; Stride
> pxor mm7, mm7
>
>+ push dword 0x00010001
>+ push dword 0x00010001
> COPY_8_TO_16_SUB2_MMX 0
> COPY_8_TO_16_SUB2_MMX 1
> COPY_8_TO_16_SUB2_MMX 2
> COPY_8_TO_16_SUB2_MMX 3
>+ add esp, byte 8
>
> pop esi
> pop ebx
>@@ -546,3 +535,5 @@ transfer8x8_copy_mmx:
> ret
> .endfunc
>
>+section .note.GNU-stack noalloc noexec nowrite progbits
>+
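For reference, the patch removes the TEXTRELs with two recurring idioms: loading constants through the GOT instead of via absolute addresses, and (where no register can be spared for the GOT base) materializing the constant on the stack so no data reference remains at all. Every object also gets a .note.GNU-stack section so the linker does not request an executable stack. The following is a minimal stand-alone sketch of both techniques, not part of the patch; it assumes NASM targeting 32-bit ELF, and the names pic_demo and my_const are illustrative only.

; sketch only -- pic_demo/my_const are hypothetical names, not from the patch
BITS 32
extern _GLOBAL_OFFSET_TABLE_

SECTION .rodata
my_const: dw 1, 1, 1, 1                 ; constant formerly read via absolute address

SECTION .text
pic_demo:
 push ebp
 call get_pc.bp                          ; ebp <- return address (the add below)
 add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc ; ebp <- GOT base
 movq mm0, [ebp + my_const wrt ..gotoff] ; GOT-relative load, no text relocation

 ; alternative when no register is free for the GOT base:
 ; build the same 1,1,1,1 qword on the stack instead of in .rodata
 push dword 0x00010001
 push dword 0x00010001
 movq mm1, [esp]                         ; no relocation of any kind needed
 add esp, byte 8

 emms                                    ; leave MMX state clean
 pop ebp
 ret

get_pc.bp:
 mov ebp, [esp]                          ; return address == caller's PC
 retn

; mark this object as not requiring an executable stack
section .note.GNU-stack noalloc noexec nowrite progbits

With the GOT base held in a register, every former [symbol] access becomes [reg + symbol wrt ..gotoff], which is exactly the transformation the patch applies file by file above.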