diff -urp xvidcore-1.1.0-old/build/generic/configure.in xvidcore-1.1.0/build/generic/configure.in --- xvidcore-1.1.0-old/build/generic/configure.in 2006-02-19 01:39:46.000000000 +0100 +++ xvidcore-1.1.0/build/generic/configure.in 2006-02-19 01:49:04.000000000 +0100 @@ -349,11 +349,11 @@ if test "$ARCHITECTURE" = "IA32" -o "$AR chosen_asm_prog="" dnl Check for yasm first - AC_CHECK_PROG([ac_yasm], [$yasm_prog], [yes], [no], , [yes]) - if test "$ac_yasm" = "yes" ; then - found_nasm_comp_prog="yes" - chosen_asm_prog="$yasm_prog" - fi +dnl AC_CHECK_PROG([ac_yasm], [$yasm_prog], [yes], [no], , [yes]) +dnl if test "$ac_yasm" = "yes" ; then +dnl found_nasm_comp_prog="yes" +dnl chosen_asm_prog="$yasm_prog" +dnl fi dnl if yasm hasn't been found, then check for nasm (not buggy version) if test "$found_nasm_comp_prog" = "no" -a "$ARCHITECTURE" != "X86_64" ; then diff -urp xvidcore-1.1.0-old/src/bitstream/x86_asm/cbp_mmx.asm xvidcore-1.1.0/src/bitstream/x86_asm/cbp_mmx.asm --- xvidcore-1.1.0-old/src/bitstream/x86_asm/cbp_mmx.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/bitstream/x86_asm/cbp_mmx.asm 2006-02-19 02:05:36.000000000 +0100 @@ -50,23 +50,6 @@ BITS 32 %endmacro ;============================================================================= -; Local data -;============================================================================= - -%ifdef FORMAT_COFF -SECTION .rodata -%else -SECTION .rodata align=16 -%endif - -ALIGN 16 - -mult_mask: - db 0x10,0x20,0x04,0x08,0x01,0x02,0x00,0x00 -ignore_dc: - dw 0, -1, -1, -1 - -;============================================================================= ; Code ;============================================================================= @@ -91,7 +74,12 @@ ALIGN 16 calc_cbp_mmx: mov eax, [esp + 4] ; coeff - movq mm7, [ignore_dc] + push byte 0 ; align esp to 8 bytes + push byte -1 + push dword 0xFFFF0000 + movq mm7, [esp] + add esp, byte 8 + pxor mm6, mm6 ; used only for comparing movq mm0, [eax+128*0] movq mm1, [eax+128*1] @@ -123,7 +111,11 @@ calc_cbp_mmx: MAKE_LOAD 13 MAKE_LOAD 14 - movq mm7, [mult_mask] + push dword 0x00000201 + push dword 0x08042010 + movq mm7, [esp] + add esp, byte 12 + packssdw mm0, mm1 packssdw mm2, mm3 packssdw mm4, mm5 diff -urp xvidcore-1.1.0-old/src/bitstream/x86_asm/cbp_sse2.asm xvidcore-1.1.0/src/bitstream/x86_asm/cbp_sse2.asm --- xvidcore-1.1.0-old/src/bitstream/x86_asm/cbp_sse2.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/bitstream/x86_asm/cbp_sse2.asm 2006-02-19 01:49:08.000000000 +0100 @@ -69,20 +69,6 @@ BITS 32 %endmacro ;============================================================================= -; Data (Read Only) -;============================================================================= - -%ifdef FORMAT_COFF -SECTION .rodata -%else -SECTION .rodata align=16 -%endif - -ALIGN 16 -ignore_dc: - dw 0, -1, -1, -1, -1, -1, -1, -1 - -;============================================================================= ; Code ;============================================================================= @@ -98,7 +84,13 @@ calc_cbp_sse2: mov edx, [esp+4] ; coeff[] xor eax, eax ; cbp = 0 - movdqu xmm7, [ignore_dc] ; mask to ignore dc value + sub esp,byte 12 ; align esp to 16 bytes + push byte -1 + push byte -1 + push byte -1 + push dword 0xFFFF0000 + movdqu xmm7, [esp] ; mask to ignore dc value + add esp, byte 28 pxor xmm6, xmm6 ; zero LOOP_SSE2 0 diff -urp xvidcore-1.1.0-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm xvidcore-1.1.0/src/dct/x86_asm/fdct_mmx_ffmpeg.asm --- 
xvidcore-1.1.0-old/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/dct/x86_asm/fdct_mmx_ffmpeg.asm 2006-02-19 01:49:08.000000000 +0100 @@ -204,7 +204,7 @@ fdct_r_row: psllw mm4, SHIFT_FRW_COL movq mm6, mm0 psubsw mm2, mm1 - movq mm1, [fdct_tg_all_16 + 4*2] + movq mm1, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff] psubsw mm0, mm4 movq mm7, [%2 + %3*2 + 3*16] pmulhw mm1, mm0 @@ -216,9 +216,9 @@ fdct_r_row: psubsw mm5, mm7 paddsw mm1, mm5 paddsw mm4, mm7 - por mm1, [fdct_one_corr] + por mm1, [ebx + fdct_one_corr wrt ..gotoff] psllw mm2, SHIFT_FRW_COL + 1 - pmulhw mm5, [fdct_tg_all_16 + 4*2] + pmulhw mm5, [ebx + fdct_tg_all_16 + 4*2 wrt ..gotoff] movq mm7, mm4 psubsw mm3, [%2 + %3*2 + 5*16] psubsw mm4, mm6 @@ -230,34 +230,34 @@ fdct_r_row: movq mm6, mm2 movq [%1 + %3*2 + 4*16], mm4 paddsw mm2, mm3 - pmulhw mm2, [ocos_4_16] + pmulhw mm2, [ebx + ocos_4_16 wrt ..gotoff] psubsw mm6, mm3 - pmulhw mm6, [ocos_4_16] + pmulhw mm6, [ebx + ocos_4_16 wrt ..gotoff] psubsw mm5, mm0 - por mm5, [fdct_one_corr] + por mm5, [ebx + fdct_one_corr wrt ..gotoff] psllw mm1, SHIFT_FRW_COL - por mm2, [fdct_one_corr] + por mm2, [ebx + fdct_one_corr wrt ..gotoff] movq mm4, mm1 movq mm3, [%2 + %3*2 + 0*16] paddsw mm1, mm6 psubsw mm3, [%2 + %3*2 + 7*16] psubsw mm4, mm6 - movq mm0, [fdct_tg_all_16 + 0*2] + movq mm0, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff] psllw mm3, SHIFT_FRW_COL - movq mm6, [fdct_tg_all_16 + 8*2] + movq mm6, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff] pmulhw mm0, mm1 movq [%1 + %3*2 + 0*16], mm7 pmulhw mm6, mm4 movq [%1 + %3*2 + 6*16], mm5 movq mm7, mm3 - movq mm5, [fdct_tg_all_16 + 8*2] + movq mm5, [ebx + fdct_tg_all_16 + 8*2 wrt ..gotoff] psubsw mm7, mm2 paddsw mm3, mm2 pmulhw mm5, mm7 paddsw mm0, mm3 paddsw mm6, mm4 - pmulhw mm3, [fdct_tg_all_16 + 0*2] - por mm0, [fdct_one_corr] + pmulhw mm3, [ebx + fdct_tg_all_16 + 0*2 wrt ..gotoff] + por mm0, [ebx + fdct_one_corr wrt ..gotoff] paddsw mm5, mm7 psubsw mm7, mm6 movq [%1 + %3*2 + 1*16], mm0 @@ -287,28 +287,28 @@ fdct_r_row: movq mm6, mm5 punpckldq mm3, mm5 punpckhdq mm6, mm3 - movq mm3, [%3 + 0*2] - movq mm4, [%3 + 4*2] + movq mm3, [0*2 + %3] + movq mm4, [4*2 + %3] punpckldq mm2, mm0 pmaddwd mm3, mm0 punpckhdq mm1, mm2 - movq mm2, [%3 + 16*2] + movq mm2, [16*2 + %3] pmaddwd mm4, mm1 - pmaddwd mm0, [%3 + 8*2] - movq mm7, [%3 + 20*2] + pmaddwd mm0, [8*2 + %3] + movq mm7, [20*2 + %3] pmaddwd mm2, mm5 - paddd mm3, [fdct_r_row] + paddd mm3, [ebx + fdct_r_row wrt ..gotoff] pmaddwd mm7, mm6 - pmaddwd mm1, [%3 + 12*2] + pmaddwd mm1, [12*2 + %3] paddd mm3, mm4 - pmaddwd mm5, [%3 + 24*2] - pmaddwd mm6, [%3 + 28*2] + pmaddwd mm5, [24*2 + %3] + pmaddwd mm6, [28*2 + %3] paddd mm2, mm7 - paddd mm0, [fdct_r_row] + paddd mm0, [ebx + fdct_r_row wrt ..gotoff] psrad mm3, SHIFT_FRW_ROW - paddd mm2, [fdct_r_row] + paddd mm2, [ebx + fdct_r_row wrt ..gotoff] paddd mm0, mm1 - paddd mm5, [fdct_r_row] + paddd mm5, [ebx + fdct_r_row wrt ..gotoff] psrad mm2, SHIFT_FRW_ROW paddd mm5, mm6 psrad mm0, SHIFT_FRW_ROW @@ -336,23 +336,23 @@ fdct_r_row: psubsw mm1, mm5 pshufw mm2, mm0, 0x4E pshufw mm3, mm1, 0x4E - movq mm4, [%3 + 0*2] - movq mm6, [%3 + 4*2] - movq mm5, [%3 + 16*2] - movq mm7, [%3 + 20*2] + movq mm4, [ 0*2 + %3] + movq mm6, [ 4*2 + %3] + movq mm5, [16*2 + %3] + movq mm7, [20*2 + %3] pmaddwd mm4, mm0 pmaddwd mm5, mm1 pmaddwd mm6, mm2 pmaddwd mm7, mm3 - pmaddwd mm0, [%3 + 8*2] - pmaddwd mm2, [%3 + 12*2] - pmaddwd mm1, [%3 + 24*2] - pmaddwd mm3, [%3 + 28*2] + pmaddwd mm0, [ 8*2 + %3] + pmaddwd mm2, [12*2 + %3] + pmaddwd mm1, [24*2 + %3] 
+ pmaddwd mm3, [28*2 + %3] paddd mm4, mm6 paddd mm5, mm7 paddd mm0, mm2 paddd mm1, mm3 - movq mm7, [fdct_r_row] + movq mm7, [ebx + fdct_r_row wrt ..gotoff] paddd mm4, mm7 paddd mm5, mm7 paddd mm0, mm7 @@ -377,6 +377,10 @@ cglobal %1 ;; Move the destination/source address to the eax register mov eax, [esp + 4] + push ebx + call get_pc.bx + add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + ;; Process the columns (4 at a time) FDCT_COLUMN_COMMON eax, eax, 0 ; columns 0..3 FDCT_COLUMN_COMMON eax, eax, 4 ; columns 4..7 @@ -386,12 +390,12 @@ cglobal %1 %assign i 0 %rep 8 ;; Process the 'i'th row - %2 eax+2*i*8, eax+2*i*8, tab_frw_01234567+2*32*i + %2 eax+2*i*8, eax+2*i*8, ebx + tab_frw_01234567+2*32*i wrt ..gotoff %assign i i+1 %endrep %else mov ecx, 8 - mov edx, tab_frw_01234567 + mov edx, [ebx + tab_frw_01234567 wrt ..gotoff] ALIGN 8 .loop %2 eax, eax, edx @@ -401,6 +405,7 @@ ALIGN 8 jne .loop %endif + pop ebx ret .endfunc %endmacro @@ -411,6 +416,11 @@ ALIGN 8 SECTION .text +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bx: + mov ebx, [esp] + retn + ;----------------------------------------------------------------------------- ; void fdct_mmx_ffmpeg(int16_t block[64]); ;----------------------------------------------------------------------------- diff -urp xvidcore-1.1.0-old/src/dct/x86_asm/fdct_mmx_skal.asm xvidcore-1.1.0/src/dct/x86_asm/fdct_mmx_skal.asm --- xvidcore-1.1.0-old/src/dct/x86_asm/fdct_mmx_skal.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/dct/x86_asm/fdct_mmx_skal.asm 2006-02-19 01:49:08.000000000 +0100 @@ -294,15 +294,15 @@ MMX_One: paddsw mm2, mm1 ; mm2: t6+t5 movq [%1+0*16], mm5 ; => out0 - movq mm4, [tan2] ; mm4 <= tan2 + movq mm4, [ebx + tan2 wrt ..gotoff] ; mm4 <= tan2 pmulhw mm4, mm7 ; tm03*tan2 - movq mm5, [tan2] ; mm5 <= tan2 + movq mm5, [ebx + tan2 wrt ..gotoff] ; mm5 <= tan2 psubsw mm4, mm6 ; out6 = tm03*tan2 - tm12 pmulhw mm5, mm6 ; tm12*tan2 paddsw mm5, mm7 ; out2 = tm12*tan2 + tm03 - movq mm6, [sqrt2] - movq mm7, [MMX_One] + movq mm6, [ebx + sqrt2 wrt ..gotoff] + movq mm7, [ebx + MMX_One wrt ..gotoff] pmulhw mm2, mm6 ; mm2: tp65 = (t6 + t5)*cos4 por mm5, mm7 ; correct out2 @@ -320,8 +320,8 @@ MMX_One: paddsw mm2, mm4 ; mm2: tp765 = t7 + tp65 paddsw mm1, mm5 ; mm1: tp465 = t4 + tm65 - movq mm4, [tan3] ; tan3 - 1 - movq mm5, [tan1] ; tan1 + movq mm4, [ebx + tan3 wrt ..gotoff] ; tan3 - 1 + movq mm5, [ebx + tan1 wrt ..gotoff] ; tan1 movq mm7, mm3 ; save tm465 pmulhw mm3, mm4 ; tm465*(tan3-1) @@ -364,23 +364,23 @@ MMX_One: punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1] punpckhdq mm1, mm7 ; mm1 = [b2 b3 a2 a3] - movq mm2, qword [%3 + 0] ; [ M00 M01 M16 M17] - movq mm3, qword [%3 + 8] ; [ M02 M03 M18 M19] + movq mm2, qword [0 + %3] ; [ M00 M01 M16 M17] + movq mm3, qword [8 + %3] ; [ M02 M03 M18 M19] pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17] - movq mm4, qword [%3 + 16] ; [ M04 M05 M20 M21] + movq mm4, qword [16 + %3] ; [ M04 M05 M20 M21] pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19] - movq mm5, qword [%3 + 24] ; [ M06 M07 M22 M23] + movq mm5, qword [24 + %3] ; [ M06 M07 M22 M23] pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21] - movq mm6, qword [%3 + 32] ; [ M08 M09 M24 M25] + movq mm6, qword [32 + %3] ; [ M08 M09 M24 M25] pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23] - movq mm7, qword [%3 + 40] ; [ M10 M11 M26 M27] + movq mm7, qword [40 + %3] ; [ M10 M11 M26 M27] pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25] paddd mm2, mm3 ; [ out0 | out1 ] pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27] psrad mm2, 16 - pmaddwd mm0, qword [%3 + 48] ; 
[a0.M12+a1.M13 | b0.M28+b1.M29] + pmaddwd mm0, qword [48 + %3] ; [a0.M12+a1.M13 | b0.M28+b1.M29] paddd mm4, mm5 ; [ out2 | out3 ] - pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31] + pmaddwd mm1, qword [56 + %3] ; [a0.M14+a1.M15 | b0.M30+b1.M31] psrad mm4, 16 paddd mm6, mm7 ; [ out4 | out5 ] @@ -422,23 +422,23 @@ MMX_One: punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1] punpckhdq mm1, mm7 ; mm1 = [b2 b3 a2 a3] - movq mm2, qword [%3 + 0] ; [ M00 M01 M16 M17] - movq mm3, qword [%3 + 8] ; [ M02 M03 M18 M19] + movq mm2, qword [0 + %3] ; [ M00 M01 M16 M17] + movq mm3, qword [8 + %3] ; [ M02 M03 M18 M19] pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17] - movq mm4, qword [%3 + 16] ; [ M04 M05 M20 M21] + movq mm4, qword [16 + %3] ; [ M04 M05 M20 M21] pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19] - movq mm5, qword [%3 + 24] ; [ M06 M07 M22 M23] + movq mm5, qword [24 + %3] ; [ M06 M07 M22 M23] pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21] - movq mm6, qword [%3 + 32] ; [ M08 M09 M24 M25] + movq mm6, qword [32 + %3] ; [ M08 M09 M24 M25] pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23] - movq mm7, qword [%3 + 40] ; [ M10 M11 M26 M27] + movq mm7, qword [40 + %3] ; [ M10 M11 M26 M27] pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25] paddd mm2, mm3 ; [ out0 | out1 ] pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27] psrad mm2, 16 - pmaddwd mm0, qword [%3 + 48] ; [a0.M12+a1.M13 | b0.M28+b1.M29] + pmaddwd mm0, qword [48 + %3] ; [a0.M12+a1.M13 | b0.M28+b1.M29] paddd mm4, mm5 ; [ out2 | out3 ] - pmaddwd mm1, qword [%3 + 56] ; [a0.M14+a1.M15 | b0.M30+b1.M31] + pmaddwd mm1, qword [56 + %3] ; [a0.M14+a1.M15 | b0.M30+b1.M31] psrad mm4, 16 paddd mm6, mm7 ; [ out4 | out5 ] @@ -467,12 +467,16 @@ MMX_One: ALIGN 16 cglobal %1 %1: + push ebx + call get_pc.bx + add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + %ifdef UNROLLED_LOOP - mov ecx, [esp + 4] + mov ecx, [esp + 4 + 4] %else - push ebx + push esi push edi - mov ecx, [esp + 8 + 4] + mov ecx, [esp + 12 + 4] %endif fLLM_PASS ecx+0, ecx+0, 3 @@ -481,27 +485,28 @@ cglobal %1 %ifdef UNROLLED_LOOP %assign i 0 %rep 8 - %2 ecx+i*16, ecx+i*16, fdct_table+i*64, fdct_rounding_1+i*8, fdct_rounding_2+i*8 + %2 ecx+i*16, ecx+i*16, ebx + fdct_table+i*64 wrt ..gotoff, ebx + fdct_rounding_1+i*8 wrt ..gotoff, ebx + fdct_rounding_2+i*8 wrt ..gotoff %assign i i+1 %endrep %else mov eax, 8 - mov edx, fdct_table - mov ebx, fdct_rounding_1 - mov edi, fdct_rounding_2 + lea edx, [ebx + fdct_table wrt ..gotoff] + lea esi, [ebx + fdct_rounding_1 wrt ..gotoff] + lea edi, [ebx + fdct_rounding_2 wrt ..gotoff] .loop - %2 ecx, ecx, edx, ebx, edi + %2 ecx, ecx, edx, esi, edi add ecx, 2*8 add edx, 2*32 - add ebx, 2*4 + add esi, 2*4 add edi, 2*4 dec eax jne .loop pop edi - pop ebx + pop esi %endif + pop ebx ret .endfunc %endmacro @@ -512,6 +517,11 @@ cglobal %1 SECTION .text +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bx: + mov ebx, [esp] + retn + ;----------------------------------------------------------------------------- ; void fdct_mmx_skal(int16_t block[64]]; ;----------------------------------------------------------------------------- diff -urp xvidcore-1.1.0-old/src/dct/x86_asm/fdct_sse2_skal.asm xvidcore-1.1.0/src/dct/x86_asm/fdct_sse2_skal.asm --- xvidcore-1.1.0-old/src/dct/x86_asm/fdct_sse2_skal.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/dct/x86_asm/fdct_sse2_skal.asm 2006-02-19 02:40:03.000000000 +0100 @@ -238,10 +238,10 @@ cglobal fdct_sse2_skal pshufd xmm6, xmm0, 01010101b ; [13131313] pshufd xmm7, xmm0, 11111111b ; [57575757] - pmaddwd xmm4, [%2+ 
0] ; dot [M00,M01][M04,M05][M08,M09][M12,M13] - pmaddwd xmm5, [%2+16] ; dot [M02,M03][M06,M07][M10,M11][M14,M15] - pmaddwd xmm6, [%2+32] ; dot [M16,M17][M20,M21][M24,M25][M28,M29] - pmaddwd xmm7, [%2+48] ; dot [M18,M19][M22,M23][M26,M27][M30,M31] + pmaddwd xmm4, [ 0 + %2] ; dot [M00,M01][M04,M05][M08,M09][M12,M13] + pmaddwd xmm5, [16 + %2] ; dot [M02,M03][M06,M07][M10,M11][M14,M15] + pmaddwd xmm6, [32 + %2] ; dot [M16,M17][M20,M21][M24,M25][M28,M29] + pmaddwd xmm7, [48 + %2] ; dot [M18,M19][M22,M23][M26,M27][M30,M31] paddd xmm4, [%3] ; Round paddd xmm6, xmm7 ; [b0|b1|b2|b3] @@ -267,12 +267,12 @@ cglobal fdct_sse2_skal %macro iLLM_PASS 1 ; %1: src/dst - movdqa xmm0, [tan3] ; t3-1 + movdqa xmm0, [ebx + tan3 wrt ..gotoff] ; t3-1 movdqa xmm3, [%1+16*3] ; x3 movdqa xmm1, xmm0 ; t3-1 movdqa xmm5, [%1+16*5] ; x5 - movdqa xmm4, [tan1] ; t1 + movdqa xmm4, [ebx + tan1 wrt ..gotoff] ; t1 movdqa xmm6, [%1+16*1] ; x1 movdqa xmm7, [%1+16*7] ; x7 movdqa xmm2, xmm4 ; t1 @@ -290,7 +290,7 @@ cglobal fdct_sse2_skal psubsw xmm2, xmm7 ; x1*t1-x7 = tm17 - movdqa xmm3, [sqrt2] + movdqa xmm3, [ebx + sqrt2 wrt ..gotoff] movdqa xmm7, xmm4 movdqa xmm6, xmm2 psubsw xmm4, xmm1 ; tp17-tp35 = t1 @@ -310,7 +310,7 @@ cglobal fdct_sse2_skal paddsw xmm0, xmm0 ; 2.(t1+t2) = b1 paddsw xmm4, xmm4 ; 2.(t1-t2) = b2 - movdqa xmm7, [tan2] ; t2 + movdqa xmm7, [ebx + tan2 wrt ..gotoff] ; t2 movdqa xmm3, [%1+2*16] ; x2 movdqa xmm6, [%1+6*16] ; x6 movdqa xmm5, xmm7 ; t2 @@ -402,55 +402,58 @@ cglobal fdct_sse2_skal ALIGN 16 idct_sse2_skal: + push ebx + call get_pc.bx + add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc - mov ecx, [esp+ 4] ; Src + mov ecx, [esp+ 4 +4] ; Src TEST_ROW ecx, .Row0_Round - iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11 + iMTX_MULT 0, ebx + iTab1 wrt ..gotoff, ebx + 16*0 + Walken_Idct_Rounders wrt ..gotoff, 11 jmp .Row1 .Row0_Round - movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 8*0] + movdqa xmm0, [ebx + 16*8 + 8*0 + Walken_Idct_Rounders wrt ..gotoff] movdqa [ecx ], xmm0 .Row1 TEST_ROW ecx+16, .Row1_Round - iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11 + iMTX_MULT 1, ebx + iTab2 wrt ..gotoff, ebx + 16*1 + Walken_Idct_Rounders wrt ..gotoff, 11 jmp .Row2 .Row1_Round - movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1] + movdqa xmm0, [ebx + 16*8 + 16*1 + Walken_Idct_Rounders wrt ..gotoff] movdqa [ecx+16 ], xmm0 .Row2 TEST_ROW ecx+32, .Row2_Round - iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11 + iMTX_MULT 2, ebx + iTab3 wrt ..gotoff, ebx + 16*2 + Walken_Idct_Rounders wrt ..gotoff, 11 jmp .Row3 .Row2_Round - movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2] + movdqa xmm0, [ebx + 16*8 + 16*2 + Walken_Idct_Rounders wrt ..gotoff] movdqa [ecx+32 ], xmm0 .Row3 TEST_ROW ecx+48, .Row4 - iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11 + iMTX_MULT 3, ebx + iTab4 wrt ..gotoff, ebx + 16*3 + Walken_Idct_Rounders wrt ..gotoff, 11 .Row4 TEST_ROW ecx+64, .Row5 - iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11 + iMTX_MULT 4, ebx + iTab1 wrt ..gotoff, ebx + 16*4 + Walken_Idct_Rounders wrt ..gotoff, 11 .Row5 TEST_ROW ecx+80, .Row6 - iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11 + iMTX_MULT 5, ebx + iTab4 wrt ..gotoff, ebx + 16*5 + Walken_Idct_Rounders wrt ..gotoff, 11 .Row6 TEST_ROW ecx+96, .Row7 - iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11 + iMTX_MULT 6, ebx + iTab3 wrt ..gotoff, ebx + 16*6 + Walken_Idct_Rounders wrt ..gotoff, 11 .Row7 TEST_ROW ecx+112, .End - iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11 + iMTX_MULT 7, ebx + iTab2 wrt ..gotoff, ebx + 16*7 + Walken_Idct_Rounders wrt 
..gotoff, 11 .End iLLM_PASS ecx - + pop ebx ret .endfunc @@ -507,15 +510,15 @@ idct_sse2_skal: paddsw xmm2, xmm1 ; xmm2: t6+t5 movdqa [%1+0*16], xmm5 ; => out0 - movdqa xmm4, [tan2] ; xmm4 <= tan2 + movdqa xmm4, [ebx + tan2 wrt ..gotoff] ; xmm4 <= tan2 pmulhw xmm4, xmm7 ; tm03*tan2 - movdqa xmm5, [tan2] ; xmm5 <= tan2 + movdqa xmm5, [ebx + tan2 wrt ..gotoff] ; xmm5 <= tan2 psubsw xmm4, xmm6 ; out6 = tm03*tan2 - tm12 pmulhw xmm5, xmm6 ; tm12*tan2 paddsw xmm5, xmm7 ; out2 = tm12*tan2 + tm03 - movdqa xmm6, [sqrt2] - movdqa xmm7, [Rounder1] + movdqa xmm6, [ebx + sqrt2 wrt ..gotoff] + movdqa xmm7, [ebx + Rounder1 wrt ..gotoff] pmulhw xmm2, xmm6 ; xmm2: tp65 = (t6 + t5)*cos4 por xmm5, xmm7 ; correct out2 @@ -533,8 +536,8 @@ idct_sse2_skal: paddsw xmm2, xmm4 ; xmm2: tp765 = t7 + tp65 paddsw xmm1, xmm5 ; xmm1: tp465 = t4 + tm65 - movdqa xmm4, [tan3] ; tan3 - 1 - movdqa xmm5, [tan1] ; tan1 + movdqa xmm4, [ebx + tan3 wrt ..gotoff] ; tan3 - 1 + movdqa xmm5, [ebx + tan1 wrt ..gotoff] ; tan1 movdqa xmm7, xmm3 ; save tm465 pmulhw xmm3, xmm4 ; tm465*(tan3-1) @@ -581,12 +584,12 @@ idct_sse2_skal: ; [M08 M09 M24 M25] [M14 M15 M30 M31] x mm0 = [4 /5 /6'/7'] ; [M10 M11 M26 M27] [M12 M13 M28 M29] x mm2 = [4'/5'/6 /7 ] - movdqa xmm1, [%2+16] - movdqa xmm3, [%2+32] + movdqa xmm1, [16+%2] + movdqa xmm3, [32+%2] pmaddwd xmm1, xmm2 pmaddwd xmm3, xmm0 - pmaddwd xmm2, [%2+48] - pmaddwd xmm0, [%2+ 0] + pmaddwd xmm2, [48+%2] + pmaddwd xmm0, [ 0+%2] paddd xmm0, xmm1 ; [ out0 | out1 ][ out2 | out3 ] paddd xmm2, xmm3 ; [ out4 | out5 ][ out6 | out7 ] @@ -601,22 +604,33 @@ idct_sse2_skal: movdqa [ecx+%1*16+0], xmm0 %endmacro +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bx: + mov ebx, [esp] + retn + ;----------------------------------------------------------------------------- ; Function Forward DCT ;----------------------------------------------------------------------------- ALIGN 16 fdct_sse2_skal: - mov ecx, [esp+4] + push ebx + call get_pc.bx + add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov ecx, [esp+4+4] fLLM_PASS ecx+0, 3 - fMTX_MULT 0, fTab1, Fdct_Rnd0 - fMTX_MULT 1, fTab2, Fdct_Rnd2 - fMTX_MULT 2, fTab3, Fdct_Rnd1 - fMTX_MULT 3, fTab4, Fdct_Rnd1 - fMTX_MULT 4, fTab1, Fdct_Rnd0 - fMTX_MULT 5, fTab4, Fdct_Rnd1 - fMTX_MULT 6, fTab3, Fdct_Rnd1 - fMTX_MULT 7, fTab2, Fdct_Rnd1 + fMTX_MULT 0, ebx + fTab1 wrt ..gotoff, ebx + Fdct_Rnd0 wrt ..gotoff + fMTX_MULT 1, ebx + fTab2 wrt ..gotoff, ebx + Fdct_Rnd2 wrt ..gotoff + fMTX_MULT 2, ebx + fTab3 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff + fMTX_MULT 3, ebx + fTab4 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff + fMTX_MULT 4, ebx + fTab1 wrt ..gotoff, ebx + Fdct_Rnd0 wrt ..gotoff + fMTX_MULT 5, ebx + fTab4 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff + fMTX_MULT 6, ebx + fTab3 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff + fMTX_MULT 7, ebx + fTab2 wrt ..gotoff, ebx + Fdct_Rnd1 wrt ..gotoff + + pop ebx ret .endfunc diff -urp xvidcore-1.1.0-old/src/dct/x86_asm/idct_3dne.asm xvidcore-1.1.0/src/dct/x86_asm/idct_3dne.asm --- xvidcore-1.1.0-old/src/dct/x86_asm/idct_3dne.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/dct/x86_asm/idct_3dne.asm 2006-02-19 01:55:22.000000000 +0100 @@ -223,6 +223,11 @@ tab_i_35_xmm: SECTION .text +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bx: + mov ebx, [esp] + retn + cglobal idct_3dne ;----------------------------------------------------------------------------- @@ -231,25 +236,29 @@ cglobal idct_3dne ALIGN 16 idct_3dne: - mov eax, [esp+4] + push ebx + call get_pc.bx + add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov eax, [esp+4+4] ; 
DCT_8_INV_ROW_1_s [eax+64], [eax+64], tab_i_04_sse, rounder_4 ;rounder_4=0 pshufw mm0, [eax+64],10001000b ; x2 x0 x2 x0 - movq mm3, [tab_i_04_xmm] ; 3 ; w05 w04 w01 w00 + movq mm3, [ebx + tab_i_04_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 pshufw mm1, [eax+64+8],10001000b ; x6 x4 x6 x4 - movq mm4, [tab_i_04_xmm+8] ; 4 ; w07 w06 w03 w02 + movq mm4, [ebx + tab_i_04_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 pshufw mm2, [eax+64],11011101b ; x3 x1 x3 x1 pshufw mm5, [eax+64+8],11011101b ; x7 x5 x7 x5 - movq mm6, [tab_i_04_xmm+32] ; 6 ; w21 w20 w17 w16 + movq mm6, [ebx + tab_i_04_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 - movq mm7, [tab_i_04_xmm+40] ; 7 ; w23 w22 w19 w18 ; - pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 + movq mm7, [ebx + tab_i_04_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 ; + pmaddwd mm0, [ebx + tab_i_04_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 - pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 + pmaddwd mm1, [ebx + tab_i_04_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 - pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 + pmaddwd mm2, [ebx + tab_i_04_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 - pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 + pmaddwd mm5, [ebx + tab_i_04_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) pshufw mm1, [eax+80+8],10001000b ; x6 x4 x6 x4 @@ -260,12 +269,12 @@ idct_3dne: movq mm7, mm0 ; 7 ; a3 a2 psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 - movq mm3, [tab_i_35_xmm] ; 3 ; w05 w04 w01 w00 + movq mm3, [ebx + tab_i_35_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 psubd mm7, mm2 ; ; a3-b3 a2-b2 paddd mm0, mm2 ; 0 free a3+b3 a2+b2 pshufw mm2, [eax+80],11011101b; x3 x1 x3 x1 pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 - pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 + pmaddwd mm5, [ebx + tab_i_35_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 @@ -276,19 +285,19 @@ idct_3dne: movq [eax+64], mm6 ; 3 ; save y3 y2 y1 y0 stall2 ; DCT_8_INV_ROW_1_s [eax+80], [eax+80], tab_i_35_xmm, rounder_5 - movq mm4, [tab_i_35_xmm+8] ; 4 ; w07 w06 w03 w02 - movq mm6, [tab_i_35_xmm+32] ; 6 ; w21 w20 w17 w16 + movq mm4, [ebx + tab_i_35_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 + movq mm6, [ebx + tab_i_35_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 - paddd mm3, [rounder_5] ; +rounder stall 6 - paddd mm5, [rounder_5] ; +rounder + paddd mm3, [ebx + rounder_5 wrt ..gotoff] ; +rounder stall 6 + paddd mm5, [ebx + rounder_5 wrt ..gotoff] ; +rounder movq [eax+64+8], mm7 ; 7 ; save y7 y6 y5 y4 - movq mm7, [tab_i_35_xmm+40] ; 7 ; w23 w22 w19 w18 + movq mm7, [ebx + tab_i_35_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 - pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 + pmaddwd mm1, [ebx + tab_i_35_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 - pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 + pmaddwd mm2, [ebx + tab_i_35_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 
x3*w25+x1*w24 pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 - pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 + pmaddwd mm0, [ebx + tab_i_35_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) pshufw mm1, [eax+96+8],10001000b ; x6 x4 x6 x4 @@ -299,12 +308,12 @@ idct_3dne: movq mm7, mm5 ; 7 ; a3 a2 psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 - movq mm3, [tab_i_26_xmm] ; 3 ; w05 w04 w01 w00 + movq mm3, [ebx + tab_i_26_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 psubd mm7, mm2 ; ; a3-b3 a2-b2 paddd mm5, mm2 ; 0 free a3+b3 a2+b2 pshufw mm2, [eax+96],11011101b; x3 x1 x3 x1 pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 - pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 + pmaddwd mm0, [ebx + tab_i_26_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 @@ -315,19 +324,19 @@ idct_3dne: movq [eax+80], mm6 ; 3 ; save y3 y2 y1 y0 ; DCT_8_INV_ROW_1_s [eax+96], [eax+96], tab_i_26_xmm, rounder_6 - movq mm4, [tab_i_26_xmm+8] ; 4 ; w07 w06 w03 w02 - movq mm6, [tab_i_26_xmm+32] ; 6 ; w21 w20 w17 w16 + movq mm4, [ebx + tab_i_26_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 + movq mm6, [ebx + tab_i_26_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 - paddd mm3, [rounder_6] ; +rounder - paddd mm0, [rounder_6] ; +rounder + paddd mm3, [ebx + rounder_6 wrt ..gotoff] ; +rounder + paddd mm0, [ebx + rounder_6 wrt ..gotoff] ; +rounder movq [eax+80+8], mm7 ; 7 ; save y7 y6 - movq mm7, [tab_i_26_xmm+40] ; 7 ; w23 w22 w19 w18 + movq mm7, [ebx + tab_i_26_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 - pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 + pmaddwd mm1, [ebx + tab_i_26_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 - pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 + pmaddwd mm2, [ebx + tab_i_26_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 - pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 + pmaddwd mm5, [ebx + tab_i_26_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) pshufw mm1, [eax+112+8],10001000b ; x6 x4 x6 x4 @@ -338,12 +347,12 @@ idct_3dne: movq mm7, mm0 ; 7 ; a3 a2 psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 - movq mm3, [tab_i_17_xmm] ; 3 ; w05 w04 w01 w00 + movq mm3, [ebx + tab_i_17_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 psubd mm7, mm2 ; ; a3-b3 a2-b2 paddd mm0, mm2 ; 0 free a3+b3 a2+b2 pshufw mm2, [eax+112],11011101b; x3 x1 x3 x1 pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 - pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 + pmaddwd mm5, [ebx + tab_i_17_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 @@ -354,19 +363,19 @@ idct_3dne: movq [eax+96], mm6 ; 3 ; save y3 y2 y1 y0 stall2 ; DCT_8_INV_ROW_1_s [eax+112], [eax+112], tab_i_17_xmm, rounder_7 - movq mm4, [tab_i_17_xmm+8] ; 4 ; w07 w06 w03 w02 - movq mm6, [tab_i_17_xmm+32] ; 6 ; 
w21 w20 w17 w16 + movq mm4, [ebx + tab_i_17_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 + movq mm6, [ebx + tab_i_17_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 - paddd mm3, [rounder_7] ; +rounder stall 6 - paddd mm5, [rounder_7] ; +rounder + paddd mm3, [ebx + rounder_7 wrt ..gotoff] ; +rounder stall 6 + paddd mm5, [ebx + rounder_7 wrt ..gotoff] ; +rounder movq [eax+96+8], mm7 ; 7 ; save y7 y6 y5 y4 - movq mm7, [tab_i_17_xmm+40] ; 7 ; w23 w22 w19 w18 + movq mm7, [ebx + tab_i_17_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 - pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 + pmaddwd mm1, [ebx + tab_i_17_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 - pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 + pmaddwd mm2, [ebx + tab_i_17_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 - pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 + pmaddwd mm0, [ebx + tab_i_17_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) pshufw mm1, [eax+0+8],10001000b; x6 x4 x6 x4 @@ -377,12 +386,12 @@ idct_3dne: movq mm7, mm5 ; 7 ; a3 a2 psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 - movq mm3, [tab_i_04_xmm] ; 3 ; w05 w04 w01 w00 + movq mm3, [ebx + tab_i_04_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 psubd mm7, mm2 ; ; a3-b3 a2-b2 paddd mm5, mm2 ; 0 free a3+b3 a2+b2 pshufw mm2, [eax+0],11011101b ; x3 x1 x3 x1 pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 - pmaddwd mm0, [tab_i_04_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 + pmaddwd mm0, [ebx + tab_i_04_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 @@ -393,19 +402,19 @@ idct_3dne: movq [eax+112], mm6 ; 3 ; save y3 y2 y1 y0 ; DCT_8_INV_ROW_1_s [eax+0], 0, tab_i_04_xmm, rounder_0 - movq mm4, [tab_i_04_xmm+8] ; 4 ; w07 w06 w03 w02 - movq mm6, [tab_i_04_xmm+32] ; 6 ; w21 w20 w17 w16 + movq mm4, [ebx + tab_i_04_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 + movq mm6, [ebx + tab_i_04_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 - paddd mm3, [rounder_0] ; +rounder - paddd mm0, [rounder_0] ; +rounder + paddd mm3, [ebx + rounder_0 wrt ..gotoff] ; +rounder + paddd mm0, [ebx + rounder_0 wrt ..gotoff] ; +rounder movq [eax+112+8], mm7 ; 7 ; save y7 y6 - movq mm7, [tab_i_04_xmm+40] ; 7 ; w23 w22 w19 w18 + movq mm7, [ebx + tab_i_04_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 - pmaddwd mm1, [tab_i_04_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 + pmaddwd mm1, [ebx + tab_i_04_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 - pmaddwd mm2, [tab_i_04_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 + pmaddwd mm2, [ebx + tab_i_04_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 - pmaddwd mm5, [tab_i_04_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 + pmaddwd mm5, [ebx + tab_i_04_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) paddd mm0, mm1 ; 1 pshufw mm1, [eax+16+8],10001000b ; x6 x4 x6 x4 @@ -416,12 +425,12 @@ idct_3dne: movq mm7, mm0 ; 7 ; a3 
a2 psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 - movq mm3, [tab_i_17_xmm] ; 3 ; w05 w04 w01 w00 + movq mm3, [ebx + tab_i_17_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 psubd mm7, mm2 ; ; a3-b3 a2-b2 paddd mm0, mm2 ; 0 free a3+b3 a2+b2 pshufw mm2, [eax+16],11011101b; x3 x1 x3 x1 pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 - pmaddwd mm5, [tab_i_17_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 + pmaddwd mm5, [ebx + tab_i_17_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 @@ -432,19 +441,19 @@ idct_3dne: movq [eax+0], mm6 ; 3 ; save y3 y2 y1 y0 stall2 ; DCT_8_INV_ROW_1_s [eax+16], 16, tab_i_17_xmm, rounder_1 - movq mm4, [tab_i_17_xmm+8] ; 4 ; w07 w06 w03 w02 - movq mm6, [tab_i_17_xmm+32] ; 6 ; w21 w20 w17 w16 + movq mm4, [ebx + tab_i_17_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 + movq mm6, [ebx + tab_i_17_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 - paddd mm3, [rounder_1] ; +rounder stall 6 - paddd mm5, [rounder_1] ; +rounder + paddd mm3, [ebx + rounder_1 wrt ..gotoff] ; +rounder stall 6 + paddd mm5, [ebx + rounder_1 wrt ..gotoff] ; +rounder movq [eax+0+8], mm7 ; 7 ; save y7 y6 y5 y4 - movq mm7, [tab_i_17_xmm+40] ; 7 ; w23 w22 w19 w18 + movq mm7, [ebx + tab_i_17_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 - pmaddwd mm1, [tab_i_17_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 + pmaddwd mm1, [ebx + tab_i_17_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 - pmaddwd mm2, [tab_i_17_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 + pmaddwd mm2, [ebx + tab_i_17_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 - pmaddwd mm0, [tab_i_17_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 + pmaddwd mm0, [ebx + tab_i_17_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) paddd mm5, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) pshufw mm1, [eax+32+8],10001000b ; x6 x4 x6 x4 @@ -455,12 +464,12 @@ idct_3dne: movq mm7, mm5 ; 7 ; a3 a2 psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 stall 5 paddd mm6, mm3 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 - movq mm3, [tab_i_26_xmm] ; 3 ; w05 w04 w01 w00 + movq mm3, [ebx + tab_i_26_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 psubd mm7, mm2 ; ; a3-b3 a2-b2 paddd mm5, mm2 ; 0 free a3+b3 a2+b2 pshufw mm2, [eax+32],11011101b; x3 x1 x3 x1 pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 - pmaddwd mm0, [tab_i_26_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 + pmaddwd mm0, [ebx + tab_i_26_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 @@ -471,19 +480,19 @@ idct_3dne: movq [eax+16], mm6 ; 3 ; save y3 y2 y1 y0 ; DCT_8_INV_ROW_1_s [eax+32], 32, tab_i_26_xmm, rounder_2 - movq mm4, [tab_i_26_xmm+8] ; 4 ; w07 w06 w03 w02 - movq mm6, [tab_i_26_xmm+32] ; 6 ; w21 w20 w17 w16 + movq mm4, [ebx + tab_i_26_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 + movq mm6, [ebx + tab_i_26_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 STALL 6 - paddd mm3, [rounder_2] ; +rounder - paddd mm0, [rounder_2] ; +rounder + paddd mm3, [ebx + rounder_2 wrt ..gotoff] ; +rounder + paddd mm0, [ebx + rounder_2 wrt ..gotoff] ; +rounder movq [eax+16+8], mm7 ; 7 ; save y7 y6 - movq mm7, 
[tab_i_26_xmm+40] ; 7 ; w23 w22 w19 w18 + movq mm7, [ebx + tab_i_26_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 - pmaddwd mm1, [tab_i_26_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 + pmaddwd mm1, [ebx + tab_i_26_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 - pmaddwd mm2, [tab_i_26_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 + pmaddwd mm2, [ebx + tab_i_26_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 pmaddwd mm7, mm5 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 - pmaddwd mm5, [tab_i_26_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 + pmaddwd mm5, [ebx + tab_i_26_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) paddd mm0, mm1 ; 1 free ; a3=sum(even3) a2=sum(even2) pshufw mm1, [eax+48+8],10001000b ; x6 x4 x6 x4 @@ -494,12 +503,12 @@ idct_3dne: movq mm7, mm0 ; 7 ; a3 a2 psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 paddd mm6, mm3 ; mm6 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 - movq mm3, [tab_i_35_xmm] ; 3 ; w05 w04 w01 w00 + movq mm3, [ebx + tab_i_35_xmm wrt ..gotoff] ; 3 ; w05 w04 w01 w00 psubd mm7, mm2 ; ; a3-b3 a2-b2 paddd mm0, mm2 ; 0 free a3+b3 a2+b2 pshufw mm2, [eax+48],11011101b; x3 x1 x3 x1 pmaddwd mm3, mm5 ; x2*w05+x0*w04 x2*w01+x0*w00 - pmaddwd mm5, [tab_i_35_xmm+16]; x2*w13+x0*w12 x2*w09+x0*w08 + pmaddwd mm5, [ebx + tab_i_35_xmm+16 wrt ..gotoff]; x2*w13+x0*w12 x2*w09+x0*w08 psrad mm4, SHIFT_INV_ROW ; y6=a1-b1 y7=a0-b0 psrad mm7, SHIFT_INV_ROW ; y4=a3-b3 y5=a2-b2 psrad mm6, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 @@ -510,26 +519,26 @@ idct_3dne: movq [eax+32], mm6 ; 3 ; save y3 y2 y1 y0 stall2 ; DCT_8_INV_ROW_1_s [eax+48], [eax+48], tab_i_35_xmm, rounder_3 - movq mm4, [tab_i_35_xmm+8] ; 4 ; w07 w06 w03 w02 - movq mm6, [tab_i_35_xmm+32] ; 6 ; w21 w20 w17 w16 + movq mm4, [ebx + tab_i_35_xmm+8 wrt ..gotoff] ; 4 ; w07 w06 w03 w02 + movq mm6, [ebx + tab_i_35_xmm+32 wrt ..gotoff] ; 6 ; w21 w20 w17 w16 pshufw mm7, mm7, 10110001b ; y7 y6 y5 y4 - paddd mm3, [rounder_3] ; +rounder stall 6 - paddd mm5, [rounder_3] ; +rounder + paddd mm3, [ebx + rounder_3 wrt ..gotoff] ; +rounder stall 6 + paddd mm5, [ebx + rounder_3 wrt ..gotoff] ; +rounder movq [eax+32+8], mm7 ; 7 ; save y7 y6 y5 y4 - movq mm7, [tab_i_35_xmm+40] ; 7 ; w23 w22 w19 w18 + movq mm7, [ebx + tab_i_35_xmm+40 wrt ..gotoff] ; 7 ; w23 w22 w19 w18 pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 - pmaddwd mm1, [tab_i_35_xmm+24]; x6*w15+x4*w14 x6*w11+x4*w10 + pmaddwd mm1, [ebx + tab_i_35_xmm+24 wrt ..gotoff]; x6*w15+x4*w14 x6*w11+x4*w10 pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 - pmaddwd mm2, [tab_i_35_xmm+48]; x3*w29+x1*w28 x3*w25+x1*w24 + pmaddwd mm2, [ebx + tab_i_35_xmm+48 wrt ..gotoff]; x3*w29+x1*w28 x3*w25+x1*w24 pmaddwd mm7, mm0 ; 7 ; x7*w23+x5*w22 x7*w19+x5*w18 ; w23 w22 w19 w18 - pmaddwd mm0, [tab_i_35_xmm+56]; x7*w31+x5*w30 x7*w27+x5*w26 + pmaddwd mm0, [ebx + tab_i_35_xmm+56 wrt ..gotoff]; x7*w31+x5*w30 x7*w27+x5*w26 paddd mm3, mm4 ; 4 free ; a1=sum(even1) a0=sum(even0) paddd mm5, mm1 ; mm1 free ; a3=sum(even3) a2=sum(even2) - movq mm1, [tg_3_16] + movq mm1, [ebx + tg_3_16 wrt ..gotoff] movq mm4, mm3 ; 4 ; a1 a0 paddd mm6, mm7 ; 7 free ; b1=sum(odd1) b0=sum(odd0) paddd mm2, mm0 ; 5 free ; b3=sum(odd3) b2=sum(odd2) - movq mm0, [tg_3_16] + movq mm0, [ebx + tg_3_16 wrt ..gotoff] movq mm7, mm5 ; 7 ; a3 a2 psubd mm4, mm6 ; 6 free ; a1-b1 a0-b0 paddd mm3, mm6 ; mm3 = mm3+mm6+mm5+mm4; a1+b1 a0+b0 @@ -542,7 +551,7 @@ idct_3dne: psrad mm2, SHIFT_INV_ROW ; y3=a3+b3 y2=a2+b2 movq mm6, [eax+16*1] packssdw mm7, mm4 ; 4 ; y6 
y7 y4 y5 - movq mm4, [tg_1_16] + movq mm4, [ebx + tg_1_16 wrt ..gotoff] packssdw mm3, mm2 ; 0 free ; y3 y2 y1 y0 pshufw mm2, mm7, 10110001b ; y7 y6 y5 y4 @@ -559,7 +568,7 @@ idct_3dne: paddsw mm1, mm3 ; x3+x5*(tg_3_16-1) psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 movq [eax+48], mm3 ; 3 ; save y3 y2 y1 y0 - movq mm3, [ocos_4_16] + movq mm3, [ebx + ocos_4_16 wrt ..gotoff] paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17 @@ -569,7 +578,7 @@ idct_3dne: psubsw mm6, mm0 ; tm17-tm35 = b3 psubsw mm4, mm1 ; tp17-tp35 = t1 paddsw mm2, mm0 ; tm17+tm35 = t2 - movq mm7, [tg_2_16] + movq mm7, [ebx + tg_2_16 wrt ..gotoff] movq mm1, mm4 ; t1 movq [eax+3*16], mm5 ; save b0 paddsw mm1, mm2 ; t1+t2 @@ -620,7 +629,7 @@ idct_3dne: movq mm6, mm2 ; a3 psraw mm4, SHIFT_INV_COL ; dst7 movq [eax+5*16], mm0 - movq mm0, [tg_3_16] + movq mm0, [ebx + tg_3_16 wrt ..gotoff] paddsw mm2, mm3 ; a3+b3 movq [eax+6*16], mm7 psubsw mm6, mm3 ; a3-b3 @@ -634,7 +643,7 @@ idct_3dne: movq mm5, [eax+8+16*5] psraw mm6, SHIFT_INV_COL ; dst4 pmulhw mm0, mm3 ; x3*(tg_3_16-1) - movq mm4, [tg_1_16] + movq mm4, [ebx + tg_1_16 wrt ..gotoff] pmulhw mm1, mm5 ; x5*(tg_3_16-1) movq mm7, [eax+8+16*7] movq [eax+3*16], mm2 @@ -646,7 +655,7 @@ idct_3dne: pmulhw mm2, mm6 ; x1*tg_1_16 paddsw mm1, mm3 ; x3+x5*(tg_3_16-1) psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 - movq mm3, [ocos_4_16] + movq mm3, [ebx + ocos_4_16 wrt ..gotoff] paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17 @@ -655,7 +664,7 @@ idct_3dne: paddsw mm5, mm1 ; tp17+tp35 = b0 psubsw mm4, mm1 ; tp17-tp35 = t1 paddsw mm2, mm0 ; tm17+tm35 = t2 - movq mm7, [tg_2_16] + movq mm7, [ebx + tg_2_16 wrt ..gotoff] movq mm1, mm4 ; t1 psubsw mm6, mm0 ; tm17-tm35 = b3 movq [eax+8+3*16], mm5 ; save b0 @@ -717,6 +726,7 @@ idct_3dne: movq [eax+8+3*16], mm2 movq [eax+8+4*16], mm6 + pop ebx ret .endfunc diff -urp xvidcore-1.1.0-old/src/dct/x86_asm/idct_mmx.asm xvidcore-1.1.0/src/dct/x86_asm/idct_mmx.asm --- xvidcore-1.1.0-old/src/dct/x86_asm/idct_mmx.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/dct/x86_asm/idct_mmx.asm 2006-02-19 02:09:26.000000000 +0100 @@ -326,25 +326,25 @@ tab_i_35_xmm: punpcklwd mm0, mm1 ; x5 x1 x4 x0 movq mm5, mm0 ; 5 ; x5 x1 x4 x0 punpckldq mm0, mm0 ; x4 x0 x4 x0 - movq mm4, [%3+8] ; 4 ; w07 w05 w03 w01 + movq mm4, [8+%3] ; 4 ; w07 w05 w03 w01 punpckhwd mm2, mm1 ; 1 ; x7 x3 x6 x2 pmaddwd mm3, mm0 ; x4*w06+x0*w04 x4*w02+x0*w00 movq mm6, mm2 ; 6 ; x7 x3 x6 x2 - movq mm1, [%3+32] ; 1 ; w22 w20 w18 w16 + movq mm1, [32+%3] ; 1 ; w22 w20 w18 w16 punpckldq mm2, mm2 ; x6 x2 x6 x2 pmaddwd mm4, mm2 ; x6*w07+x2*w05 x6*w03+x2*w01 punpckhdq mm5, mm5 ; x5 x1 x5 x1 - pmaddwd mm0, [%3+16] ; x4*w14+x0*w12 x4*w10+x0*w08 + pmaddwd mm0, [16+%3] ; x4*w14+x0*w12 x4*w10+x0*w08 punpckhdq mm6, mm6 ; x7 x3 x7 x3 - movq mm7, [%3+40] ; 7 ; w23 w21 w19 w17 + movq mm7, [40+%3] ; 7 ; w23 w21 w19 w17 pmaddwd mm1, mm5 ; x5*w22+x1*w20 x5*w18+x1*w16 paddd mm3, [%4] ; +%4 pmaddwd mm7, mm6 ; x7*w23+x3*w21 x7*w19+x3*w17 - pmaddwd mm2, [%3+24] ; x6*w15+x2*w13 x6*w11+x2*w09 + pmaddwd mm2, [24+%3] ; x6*w15+x2*w13 x6*w11+x2*w09 paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) - pmaddwd mm5, [%3+48] ; x5*w30+x1*w28 x5*w26+x1*w24 + pmaddwd mm5, [48+%3] ; x5*w30+x1*w28 x5*w26+x1*w24 movq mm4, mm3 ; 4 ; a1 a0 - pmaddwd mm6, [%3+56] ; x7*w31+x3*w29 x7*w27+x3*w25 + pmaddwd mm6, [56+%3] ; x7*w31+x3*w29 x7*w27+x3*w25 paddd mm1, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) paddd mm0, [%4] ; +%4 psubd 
mm3, mm1 ; a1-b1 a0-b0 @@ -378,25 +378,25 @@ tab_i_35_xmm: movq mm2, mm0 ; 2 ; x3 x2 x1 x0 movq mm3, [%3] ; 3 ; w05 w04 w01 w00 pshufw mm0, mm0, 10001000b ; x2 x0 x2 x0 - movq mm4, [%3+8] ; 4 ; w07 w06 w03 w02 + movq mm4, [8+%3] ; 4 ; w07 w06 w03 w02 movq mm5, mm1 ; 5 ; x7 x6 x5 x4 pmaddwd mm3, mm0 ; x2*w05+x0*w04 x2*w01+x0*w00 - movq mm6, [%3+32] ; 6 ; w21 w20 w17 w16 + movq mm6, [32+%3] ; 6 ; w21 w20 w17 w16 pshufw mm1, mm1, 10001000b ; x6 x4 x6 x4 pmaddwd mm4, mm1 ; x6*w07+x4*w06 x6*w03+x4*w02 - movq mm7, [%3+40] ; 7 ; w23 w22 w19 w18 + movq mm7, [40+%3] ; 7 ; w23 w22 w19 w18 pshufw mm2, mm2, 11011101b ; x3 x1 x3 x1 pmaddwd mm6, mm2 ; x3*w21+x1*w20 x3*w17+x1*w16 pshufw mm5, mm5, 11011101b ; x7 x5 x7 x5 pmaddwd mm7, mm5 ; x7*w23+x5*w22 x7*w19+x5*w18 paddd mm3, [%4] ; +%4 - pmaddwd mm0, [%3+16] ; x2*w13+x0*w12 x2*w09+x0*w08 + pmaddwd mm0, [16+%3] ; x2*w13+x0*w12 x2*w09+x0*w08 paddd mm3, mm4 ; 4 ; a1=sum(even1) a0=sum(even0) - pmaddwd mm1, [%3+24] ; x6*w15+x4*w14 x6*w11+x4*w10 + pmaddwd mm1, [24+%3] ; x6*w15+x4*w14 x6*w11+x4*w10 movq mm4, mm3 ; 4 ; a1 a0 - pmaddwd mm2, [%3+48] ; x3*w29+x1*w28 x3*w25+x1*w24 + pmaddwd mm2, [48+%3] ; x3*w29+x1*w28 x3*w25+x1*w24 paddd mm6, mm7 ; 7 ; b1=sum(odd1) b0=sum(odd0) - pmaddwd mm5, [%3+56] ; x7*w31+x5*w30 x7*w27+x5*w26 + pmaddwd mm5, [56+%3] ; x7*w31+x5*w30 x7*w27+x5*w26 paddd mm3, mm6 ; a1+b1 a0+b0 paddd mm0, [%4] ; +%4 psrad mm3, SHIFT_INV_ROW ; y1=a1+b1 y0=a0+b0 @@ -480,12 +480,12 @@ tab_i_35_xmm: ;----------------------------------------------------------------------------- %macro DCT_8_INV_COL 2 - movq mm0, [tg_3_16] + movq mm0, [ebx + tg_3_16 wrt ..gotoff] movq mm3, [%1+16*3] movq mm1, mm0 ; tg_3_16 movq mm5, [%1+16*5] pmulhw mm0, mm3 ; x3*(tg_3_16-1) - movq mm4, [tg_1_16] + movq mm4, [ebx + tg_1_16 wrt ..gotoff] pmulhw mm1, mm5 ; x5*(tg_3_16-1) movq mm7, [%1+16*7] movq mm2, mm4 ; tg_1_16 @@ -495,7 +495,7 @@ tab_i_35_xmm: pmulhw mm2, mm6 ; x1*tg_1_16 paddsw mm1, mm3 ; x3+x5*(tg_3_16-1) psubsw mm0, mm5 ; x3*tg_3_16-x5 = tm35 - movq mm3, [ocos_4_16] + movq mm3, [ebx + ocos_4_16 wrt ..gotoff] paddsw mm1, mm5 ; x3+x5*tg_3_16 = tp35 paddsw mm4, mm6 ; x1+tg_1_16*x7 = tp17 psubsw mm2, mm7 ; x1*tg_1_16-x7 = tm17 @@ -505,7 +505,7 @@ tab_i_35_xmm: psubsw mm6, mm0 ; tm17-tm35 = b3 psubsw mm4, mm1 ; tp17-tp35 = t1 paddsw mm2, mm0 ; tm17+tm35 = t2 - movq mm7, [tg_2_16] + movq mm7, [ebx + tg_2_16 wrt ..gotoff] movq mm1, mm4 ; t1 ; movq [SCRATCH+0], mm5 ; save b0 movq [%2+3*16], mm5 ; save b0 @@ -577,6 +577,11 @@ tab_i_35_xmm: SECTION .text +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bx: + mov ebx, [esp] + retn + cglobal idct_mmx cglobal idct_xmm @@ -586,22 +591,27 @@ cglobal idct_xmm ALIGN 16 idct_mmx: - mov eax, dword [esp + 4] + push ebx + call get_pc.bx + add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov eax, dword [esp + 4 + 4] ;; Process each row - DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, tab_i_04_mmx, rounder_0 - DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, tab_i_17_mmx, rounder_1 - DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, tab_i_26_mmx, rounder_2 - DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, tab_i_35_mmx, rounder_3 - DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, tab_i_04_mmx, rounder_4 - DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, tab_i_35_mmx, rounder_5 - DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, tab_i_26_mmx, rounder_6 - DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, tab_i_17_mmx, rounder_7 + DCT_8_INV_ROW_MMX eax+0*16, eax+0*16, ebx + tab_i_04_mmx wrt ..gotoff, ebx + rounder_0 wrt ..gotoff + DCT_8_INV_ROW_MMX eax+1*16, eax+1*16, ebx + tab_i_17_mmx wrt ..gotoff, ebx + rounder_1 wrt ..gotoff + 
DCT_8_INV_ROW_MMX eax+2*16, eax+2*16, ebx + tab_i_26_mmx wrt ..gotoff, ebx + rounder_2 wrt ..gotoff + DCT_8_INV_ROW_MMX eax+3*16, eax+3*16, ebx + tab_i_35_mmx wrt ..gotoff, ebx + rounder_3 wrt ..gotoff + DCT_8_INV_ROW_MMX eax+4*16, eax+4*16, ebx + tab_i_04_mmx wrt ..gotoff, ebx + rounder_4 wrt ..gotoff + DCT_8_INV_ROW_MMX eax+5*16, eax+5*16, ebx + tab_i_35_mmx wrt ..gotoff, ebx + rounder_5 wrt ..gotoff + DCT_8_INV_ROW_MMX eax+6*16, eax+6*16, ebx + tab_i_26_mmx wrt ..gotoff, ebx + rounder_6 wrt ..gotoff + DCT_8_INV_ROW_MMX eax+7*16, eax+7*16, ebx + tab_i_17_mmx wrt ..gotoff, ebx + rounder_7 wrt ..gotoff ;; Process the columns (4 at a time) DCT_8_INV_COL eax+0, eax+0 DCT_8_INV_COL eax+8, eax+8 + pop ebx ret .endfunc @@ -611,22 +621,27 @@ idct_mmx: ALIGN 16 idct_xmm: - mov eax, dword [esp + 4] + push ebx + call get_pc.bx + add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov eax, dword [esp + 4 + 4] ;; Process each row - DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, tab_i_04_xmm, rounder_0 - DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, tab_i_17_xmm, rounder_1 - DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, tab_i_26_xmm, rounder_2 - DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, tab_i_35_xmm, rounder_3 - DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, tab_i_04_xmm, rounder_4 - DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, tab_i_35_xmm, rounder_5 - DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, tab_i_26_xmm, rounder_6 - DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, tab_i_17_xmm, rounder_7 + DCT_8_INV_ROW_XMM eax+0*16, eax+0*16, ebx + tab_i_04_xmm wrt ..gotoff, ebx + rounder_0 wrt ..gotoff + DCT_8_INV_ROW_XMM eax+1*16, eax+1*16, ebx + tab_i_17_xmm wrt ..gotoff, ebx + rounder_1 wrt ..gotoff + DCT_8_INV_ROW_XMM eax+2*16, eax+2*16, ebx + tab_i_26_xmm wrt ..gotoff, ebx + rounder_2 wrt ..gotoff + DCT_8_INV_ROW_XMM eax+3*16, eax+3*16, ebx + tab_i_35_xmm wrt ..gotoff, ebx + rounder_3 wrt ..gotoff + DCT_8_INV_ROW_XMM eax+4*16, eax+4*16, ebx + tab_i_04_xmm wrt ..gotoff, ebx + rounder_4 wrt ..gotoff + DCT_8_INV_ROW_XMM eax+5*16, eax+5*16, ebx + tab_i_35_xmm wrt ..gotoff, ebx + rounder_5 wrt ..gotoff + DCT_8_INV_ROW_XMM eax+6*16, eax+6*16, ebx + tab_i_26_xmm wrt ..gotoff, ebx + rounder_6 wrt ..gotoff + DCT_8_INV_ROW_XMM eax+7*16, eax+7*16, ebx + tab_i_17_xmm wrt ..gotoff, ebx + rounder_7 wrt ..gotoff ;; Process the columns (4 at a time) DCT_8_INV_COL eax+0, eax+0 DCT_8_INV_COL eax+8, eax+8 + pop ebx ret .endfunc diff -urp xvidcore-1.1.0-old/src/dct/x86_asm/idct_sse2_dmitry.asm xvidcore-1.1.0/src/dct/x86_asm/idct_sse2_dmitry.asm --- xvidcore-1.1.0-old/src/dct/x86_asm/idct_sse2_dmitry.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/dct/x86_asm/idct_sse2_dmitry.asm 2006-02-19 02:08:09.000000000 +0100 @@ -183,7 +183,7 @@ cglobal idct_sse2_dmitry ;a 3210 first part pshufd xmm2, xmm1, 10101010b ;x 64646464 - pmaddwd xmm2, [%3+16] ;w 15 14 11 10 7632 + pmaddwd xmm2, [16+%3] ;w 15 14 11 10 7632 ;a 3210 second part paddd xmm2, xmm0 ;a 3210 ready @@ -191,11 +191,11 @@ cglobal idct_sse2_dmitry movdqa xmm5, xmm2 pshufd xmm3, xmm1, 01010101b ;x 31313131 - pmaddwd xmm3, [%3+32] ;w 29 28 25 24 21 20 17 16 + pmaddwd xmm3, [32+%3] ;w 29 28 25 24 21 20 17 16 ;b 3210 first part pshufd xmm4, xmm1, 11111111b ;x 75757575 - pmaddwd xmm4, [%3+48] ;w 31 30 27 26 23 22 19 18 + pmaddwd xmm4, [48+%3] ;w 31 30 27 26 23 22 19 18 ;b 3210 second part paddd xmm3,xmm4 ;b 3210 ready @@ -220,7 +220,7 @@ cglobal idct_sse2_dmitry movdqa xmm4, [%1+16*2] ;x2 movdqa xmm5, [%1+16*6] ;x6 - movdqa xmm6, [tg_2_16] + movdqa xmm6, [ebx + tg_2_16 wrt ..gotoff] movdqa xmm7, xmm6 paddsw xmm0, xmm2 
;u04=x0+x4 @@ -245,12 +245,12 @@ cglobal idct_sse2_dmitry movdqa xmm0, [%1+16*1] ;x1 movdqa xmm1, [%1+16*7] ;x7 - movdqa xmm2, [tg_1_16] + movdqa xmm2, [ebx + tg_1_16 wrt ..gotoff] movdqa xmm3, xmm2 movdqa xmm4, [%1+16*3] ;x3 movdqa xmm5, [%1+16*5] ;x5 - movdqa xmm6, [tg_3_16] + movdqa xmm6, [ebx + tg_3_16 wrt ..gotoff] movdqa xmm7, xmm6 pmulhw xmm2, xmm0 @@ -267,7 +267,7 @@ cglobal idct_sse2_dmitry psubsw xmm6, xmm5 ;v35=x3*T3-x5 paddsw xmm7, xmm4 ;u35=x5*T3+x3 - movdqa xmm4, [ocos_4_16] + movdqa xmm4, [ebx + ocos_4_16 wrt ..gotoff] paddsw xmm0, xmm7 ;b0=u17+u35 psubsw xmm1, xmm6 ;b3=v17-v35 @@ -322,26 +322,35 @@ cglobal idct_sse2_dmitry movdqa [%2+16*5], xmm7 %endmacro +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bx: + mov ebx, [esp] + retn + ;----------------------------------------------------------------------------- ; void idct_sse2_dmitry(int16_t coeff[64]); ;----------------------------------------------------------------------------- ALIGN 16 idct_sse2_dmitry: - - mov eax, [esp + 4] - - DCT_8_INV_ROW_1_SSE2 eax+ 0, eax+ 0, tab_i_04, rounder_2_0 - DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, tab_i_17, rounder_2_1 - DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, tab_i_26, rounder_2_2 - DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, tab_i_35, rounder_2_3 - DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, tab_i_04, rounder_2_4 - DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, tab_i_35, rounder_2_5 - DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, tab_i_26, rounder_2_6 - DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, tab_i_17, rounder_2_7 + push ebx + call get_pc.bx + add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov eax, [esp + 4 + 4] + + DCT_8_INV_ROW_1_SSE2 eax+ 0, eax+ 0, ebx + tab_i_04 wrt ..gotoff, ebx + rounder_2_0 wrt ..gotoff + DCT_8_INV_ROW_1_SSE2 eax+ 16, eax+ 16, ebx + tab_i_17 wrt ..gotoff, ebx + rounder_2_1 wrt ..gotoff + DCT_8_INV_ROW_1_SSE2 eax+ 32, eax+ 32, ebx + tab_i_26 wrt ..gotoff, ebx + rounder_2_2 wrt ..gotoff + DCT_8_INV_ROW_1_SSE2 eax+ 48, eax+ 48, ebx + tab_i_35 wrt ..gotoff, ebx + rounder_2_3 wrt ..gotoff + DCT_8_INV_ROW_1_SSE2 eax+ 64, eax+ 64, ebx + tab_i_04 wrt ..gotoff, ebx + rounder_2_4 wrt ..gotoff + DCT_8_INV_ROW_1_SSE2 eax+ 80, eax+ 80, ebx + tab_i_35 wrt ..gotoff, ebx + rounder_2_5 wrt ..gotoff + DCT_8_INV_ROW_1_SSE2 eax+ 96, eax+ 96, ebx + tab_i_26 wrt ..gotoff, ebx + rounder_2_6 wrt ..gotoff + DCT_8_INV_ROW_1_SSE2 eax+112, eax+112, ebx + tab_i_17 wrt ..gotoff, ebx + rounder_2_7 wrt ..gotoff DCT_8_INV_COL_4_SSE2 eax, eax + pop ebx ret .endfunc diff -urp xvidcore-1.1.0-old/src/dct/x86_asm/simple_idct_mmx.asm xvidcore-1.1.0/src/dct/x86_asm/simple_idct_mmx.asm --- xvidcore-1.1.0-old/src/dct/x86_asm/simple_idct_mmx.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/dct/x86_asm/simple_idct_mmx.asm 2006-02-19 01:51:55.000000000 +0100 @@ -122,7 +122,7 @@ coeffs: movq mm1,[src4] ; R6 R2 r6 r2 movq mm2,[src1] ; R3 R1 r3 r1 movq mm3,[src5] ; R7 R5 r7 r5 - movq mm4,[wm1010] + movq mm4,[ebx + wm1010 wrt ..gotoff] pand mm4,mm0 por mm4,mm1 por mm4,mm2 @@ -131,29 +131,29 @@ coeffs: movd eax,mm4 or eax,eax jz near .skip1 - movq mm4,[coeffs+16] ; C4 C4 C4 C4 + movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 - movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm5,[coeffs+32] ; C6 C2 C6 C2 + movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 - movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 + movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 
pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 - movq mm7,[coeffs+48] ; C3 C1 C3 C1 + movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 rounder_op mm4, rounder_arg movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 paddd mm4,mm5 ; A0 a0 psubd mm6,mm5 ; A3 a3 - movq mm5,[coeffs+56] ; C7 C5 C7 C5 + movq mm5,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5 rounder_op mm0, rounder_arg paddd mm1,mm0 ; A1 a1 paddd mm0,mm0 psubd mm0,mm1 ; A2 a2 - pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1 + pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1 paddd mm7,mm5 ; B0 b0 - movq mm5,[coeffs+72] ; -C5 -C1 -C5 -C1 + movq mm5,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5 paddd mm7,mm4 ; A0+B0 a0+b0 paddd mm4,mm4 ; 2A0 2a0 @@ -170,14 +170,14 @@ coeffs: packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1 movq [dst],mm7 movq mm1,[src1] ; R3 R1 r3 r1 - movq mm4,[coeffs+80] ;-C1 C5 -C1 C5 + movq mm4,[ebx + coeffs+80 wrt ..gotoff] ;-C1 C5 -C1 C5 movq [dst + 24],mm2 pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1 - movq mm7,[coeffs+88] ; C3 C7 C3 C7 - pmaddwd mm1,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 + movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 + pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 movq mm2,mm0 ; A2 a2 - pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 + pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 paddd mm4,mm7 ; B2 b2 paddd mm2,mm4 ; A2+B2 a2+b2 psubd mm0,mm4 ; a2-B2 a2-b2 @@ -196,7 +196,7 @@ coeffs: jmp short .skip2 .skip1 pslld mm0,16 - paddd mm0,[d40000] + paddd mm0,[ebx + d40000 wrt ..gotoff] psrad mm0,13 packssdw mm0,mm0 movq [ dst ],mm0 @@ -240,29 +240,29 @@ coeffs: movd eax,mm4 or eax,eax jz near bt - movq mm4,[coeffs+16] ; C4 C4 C4 C4 + movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 - movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm5,[coeffs+32] ; C6 C2 C6 C2 + movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 - movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 + movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 - movq mm7,[coeffs+48] ; C3 C1 C3 C1 + movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 rounder_op mm4, rounder_arg movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 paddd mm4,mm5 ; A0 a0 psubd mm6,mm5 ; A3 a3 - movq mm5,[coeffs+56] ; C7 C5 C7 C5 + movq mm5,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 pmaddwd mm5,mm3 ; C7R7+C5R5 C7r7+C5r5 rounder_op mm0, rounder_arg paddd mm1,mm0 ; A1 a1 paddd mm0,mm0 psubd mm0,mm1 ; A2 a2 - pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1 + pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1 paddd mm7,mm5 ; B0 b0 - movq mm5,[coeffs+72] ; -C5 -C1 -C5 -C1 + movq mm5,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 pmaddwd mm5,mm3 ; -C5R7-C1R5 -C5r7-C1r5 paddd mm7,mm4 ; A0+B0 a0+b0 paddd mm4,mm4 ; 2A0 2a0 @@ -279,14 +279,14 @@ coeffs: packssdw mm2,mm4 ; A0-B0 a0-b0 A1-B1 a1-b1 movq [ dst ],mm7 movq mm1,[src1] ; R3 R1 r3 r1 - movq mm4,[coeffs+80] ; -C1 C5 -C1 C5 + movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5 movq [ dst + 24 ],mm2 pmaddwd mm4,mm1 ; -C1R3+C5R1 -C1r3+C5r1 - movq mm7,[coeffs+88] ; C3 C7 C3 C7 - pmaddwd mm1,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 + movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 
C7 + pmaddwd mm1,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 movq mm2,mm0 ; A2 a2 - pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 + pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 paddd mm4,mm7 ; B2 b2 paddd mm2,mm4 ; A2+B2 a2+b2 psubd mm0,mm4 ; a2-B2 a2-b2 @@ -330,17 +330,17 @@ coeffs: movq mm1,[src4] ; R6 R2 r6 r2 movq mm2,[src1] ; R3 R1 r3 r1 movq mm3,[src5] ; R7 R5 r7 r5 - movq mm4,[coeffs+16] ; C4 C4 C4 C4 + movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 - movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm5,[coeffs+32] ; C6 C2 C6 C2 + movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 - movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 + movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 ; rounder_op mm4, rounder_arg movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 - movq mm7,[coeffs+48] ; C3 C1 C3 C1 + movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 ; rounder_op mm0, rounder_arg pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 paddd mm4,mm5 ; A0 a0 @@ -348,11 +348,11 @@ coeffs: movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 paddd mm0,mm1 ; A1 a1 psubd mm5,mm1 ; A2 a2 - movq mm1,[coeffs+56] ; C7 C5 C7 C5 + movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 - pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1 + pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1 paddd mm7,mm1 ; B0 b0 - movq mm1,[coeffs+72] ; -C5 -C1 -C5 -C1 + movq mm1,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5 paddd mm7,mm4 ; A0+B0 a0+b0 paddd mm4,mm4 ; 2A0 2a0 @@ -374,13 +374,13 @@ coeffs: packssdw mm4,mm4 ; A0-B0 a0-b0 movd [ dst + 112],mm4 movq mm0,[src1] ; R3 R1 r3 r1 - movq mm4,[coeffs+80] ; -C1 C5 -C1 C5 + movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5 pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1 - movq mm7,[coeffs+88] ; C3 C7 C3 C7 - pmaddwd mm0,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 + movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 + pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 movq mm2,mm5 ; A2 a2 - pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 + pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 paddd mm4,mm7 ; B2 b2 paddd mm2,mm4 ; A2+B2 a2+b2 psubd mm5,mm4 ; a2-B2 a2-b2 @@ -426,13 +426,13 @@ coeffs: movq mm0,[src0] ; R4 R0 r4 r0 movq mm1,[src4] ; R6 R2 r6 r2 movq mm3,[src5] ; R7 R5 r7 r5 - movq mm4,[coeffs+16] ; C4 C4 C4 C4 + movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 - movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm5,[coeffs+32] ; C6 C2 C6 C2 + movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 - movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 + movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 ; rounder_op mm4, rounder_arg movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 @@ -442,9 +442,9 @@ coeffs: movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 paddd mm0,mm1 ; A1 a1 psubd mm5,mm1 ; A2 a2 - movq mm1,[coeffs+56] ; C7 C5 C7 C5 + movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 - movq mm7,[coeffs+72] ; -C5 -C1 -C5 -C1 + movq mm7,[ebx + 
coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5 paddd mm1,mm4 ; A0+B0 a0+b0 paddd mm4,mm4 ; 2A0 2a0 @@ -464,10 +464,10 @@ coeffs: movd [ dst + 96 ],mm2 packssdw mm4,mm4 ; A0-B0 a0-b0 movd [ dst + 112 ],mm4 - movq mm1,[coeffs+88] ; C3 C7 C3 C7 + movq mm1,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5 movq mm2,mm5 ; A2 a2 - pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 + pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 paddd mm2,mm1 ; A2+B2 a2+b2 psubd mm5,mm1 ; a2-B2 a2-b2 psrad mm2,shift @@ -510,17 +510,17 @@ coeffs: %define shift %8 movq mm0,[src0] ; R4 R0 r4 r0 movq mm3,[src5] ; R7 R5 r7 r5 - movq mm4,[coeffs+16] ; C4 C4 C4 C4 + movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 - movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 ; rounder_op mm4, rounder_arg movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 ; rounder_op mm0, rounder_arg movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm1,[coeffs+56] ; C7 C5 C7 C5 + movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 - movq mm7,[coeffs+72] ; -C5 -C1 -C5 -C1 + movq mm7,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 pmaddwd mm7,mm3 ; -C5R7-C1R5 -C5r7-C1r5 paddd mm1,mm4 ; A0+B0 a0+b0 paddd mm4,mm4 ; 2A0 2a0 @@ -540,10 +540,10 @@ coeffs: movd [ dst + 96 ],mm2 packssdw mm4,mm4 ; A0-B0 a0-b0 movd [ dst + 112 ],mm4 - movq mm1,[coeffs+88] ; C3 C7 C3 C7 + movq mm1,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 pmaddwd mm1,mm3 ; C3R7+C7R5 C3r7+C7r5 movq mm2,mm5 ; A2 a2 - pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 + pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 paddd mm2,mm1 ; A2+B2 a2+b2 psubd mm5,mm1 ; a2-B2 a2-b2 psrad mm2,shift @@ -587,21 +587,21 @@ coeffs: movq mm0,[src0] ; R4 R0 r4 r0 movq mm2,[src1] ; R3 R1 r3 r1 movq mm3,[src5] ; R7 R5 r7 r5 - movq mm4,[coeffs+16] ; C4 C4 C4 C4 + movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 - movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 ; rounder_op mm4, rounder_arg movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 - movq mm7,[coeffs+48] ; C3 C1 C3 C1 + movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 ; rounder_op mm0, rounder_arg pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm1,[coeffs+56] ; C7 C5 C7 C5 + movq mm1,[ebx + coeffs+56 wrt ..gotoff] ; C7 C5 C7 C5 pmaddwd mm1,mm3 ; C7R7+C5R5 C7r7+C5r5 - pmaddwd mm2,[coeffs+64] ; -C7R3+C3R1 -C7r3+C3r1 + pmaddwd mm2,[ebx + coeffs+64 wrt ..gotoff] ; -C7R3+C3R1 -C7r3+C3r1 paddd mm7,mm1 ; B0 b0 - movq mm1,[coeffs+72] ; -C5 -C1 -C5 -C1 + movq mm1,[ebx + coeffs+72 wrt ..gotoff] ; -C5 -C1 -C5 -C1 pmaddwd mm1,mm3 ; -C5R7-C1R5 -C5r7-C1r5 paddd mm7,mm4 ; A0+B0 a0+b0 paddd mm4,mm4 ; 2A0 2a0 @@ -623,13 +623,13 @@ coeffs: packssdw mm4,mm4 ; A0-B0 a0-b0 movd [dst + 112],mm4 movq mm0,[src1] ; R3 R1 r3 r1 - movq mm4,[coeffs+80] ; -C1 C5 -C1 C5 + movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5 pmaddwd mm4,mm0 ; -C1R3+C5R1 -C1r3+C5r1 - movq mm7,[coeffs+88] ; C3 C7 C3 C7 - pmaddwd mm0,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 + movq mm7,[ebx + coeffs+88 wrt ..gotoff] ; C3 C7 C3 C7 + pmaddwd mm0,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 pmaddwd mm7,mm3 ; C3R7+C7R5 C3r7+C7r5 movq mm2,mm5 ; A2 a2 - pmaddwd mm3,[coeffs+104] ; -C1R7+C3R5 -C1r7+C3r5 
+ pmaddwd mm3,[ebx + coeffs+104 wrt ..gotoff] ; -C1R7+C3R5 -C1r7+C3r5 paddd mm4,mm7 ; B2 b2 paddd mm2,mm4 ; A2+B2 a2+b2 psubd mm5,mm4 ; a2-B2 a2-b2 @@ -674,17 +674,17 @@ coeffs: %define shift %8 movq mm0,[src0] ; R4 R0 r4 r0 movq mm2,[src1] ; R3 R1 r3 r1 - movq mm4,[coeffs+16] ; C4 C4 C4 C4 + movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 - movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 ; rounder_op mm4, rounder_arg movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 - movq mm7,[coeffs+48] ; C3 C1 C3 C1 + movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 ; rounder_op mm0, rounder_arg pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm3,[coeffs+64] + movq mm3,[ebx + coeffs+64 wrt ..gotoff] pmaddwd mm3,mm2 ; -C7R3+C3R1 -C7r3+C3r1 paddd mm7,mm4 ; A0+B0 a0+b0 paddd mm4,mm4 ; 2A0 2a0 @@ -704,9 +704,9 @@ coeffs: movd [dst + 96],mm1 packssdw mm4,mm4 ; A0-B0 a0-b0 movd [dst + 112],mm4 - movq mm4,[coeffs+80] ; -C1 C5 -C1 C5 + movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5 pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1 - pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 + pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 movq mm1,mm5 ; A2 a2 paddd mm1,mm4 ; A2+B2 a2+b2 psubd mm5,mm4 ; a2-B2 a2-b2 @@ -750,13 +750,13 @@ coeffs: %define shift %8 movq mm0,[src0] ; R4 R0 r4 r0 movq mm1,[src4] ; R6 R2 r6 r2 - movq mm4,[coeffs+16] ; C4 C4 C4 C4 + movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 - movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm5,[coeffs+32] ; C6 C2 C6 C2 + movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 - movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 + movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 ; rounder_op mm4, rounder_arg movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 @@ -768,13 +768,13 @@ coeffs: psubd mm5,mm1 ; A2 a2 movq mm2,[src0 + 8] ; R4 R0 r4 r0 movq mm3,[src4 + 8] ; R6 R2 r6 r2 - movq mm1,[coeffs+16] ; C4 C4 C4 C4 + movq mm1,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0 - movq mm7,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm7,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm7,[coeffs+32] ; C6 C2 C6 C2 + movq mm7,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 pmaddwd mm7,mm3 ; C6R6+C2R2 C6r6+C2r2 - pmaddwd mm3,[coeffs+40] ; -C2R6+C6R2 -C2r6+C6r2 + pmaddwd mm3,[ebx + coeffs+40 wrt ..gotoff] ; -C2R6+C6R2 -C2r6+C6r2 ; rounder_op mm1, rounder_arg paddd mm7,mm1 ; A0 a0 paddd mm1,mm1 ; 2C0 2c0 @@ -829,17 +829,17 @@ coeffs: movq mm0,[src0] ; R4 R0 r4 r0 movq mm1,[src4] ; R6 R2 r6 r2 movq mm2,[src1] ; R3 R1 r3 r1 - movq mm4,[coeffs+16] ; C4 C4 C4 C4 + movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 - movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm5,[coeffs+32] ; C6 C2 C6 C2 + movq mm5,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 pmaddwd mm5,mm1 ; C6R6+C2R2 C6r6+C2r2 - movq mm6,[coeffs+40] ; -C2 C6 -C2 C6 + movq mm6,[ebx + coeffs+40 wrt ..gotoff] ; -C2 C6 -C2 C6 pmaddwd mm1,mm6 ; -C2R6+C6R2 -C2r6+C6r2 ; rounder_op mm4, rounder_arg movq mm6,mm4 ; C4R4+C4R0 C4r4+C4r0 - movq mm7,[coeffs+48] ; C3 C1 C3 
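; -----------------------------------------------------------------------------
; [Sketch for illustration, not part of the patch.] Every absolute load such
; as `movq mm7,[coeffs+48]` above embeds a 32-bit address in .text and forces
; a text relocation in a shared object. The rewritten `[ebx + coeffs+48 wrt
; ..gotoff]` form replaces it with a link-time constant offset from the GOT,
; whose runtime address the prologue places in ebx, so .text stays read-only
; and shareable. Minimal standalone form (the symbol `my_qword` is
; hypothetical; should assemble with `nasm -f elf`):
BITS 32
extern _GLOBAL_OFFSET_TABLE_
SECTION .data
my_qword: dd 0x11223344, 0x55667788
SECTION .text
load_pic_const:
    call .get_got                  ; CALL pushes the EIP of .get_got
.get_got:
    pop ebx                        ; ebx = runtime address of .get_got
    add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - .get_got wrt ..gotpc ; ebx = GOT base
    movq mm0, [ebx + my_qword wrt ..gotoff] ; GOT base + link-time offset
    ret
; -----------------------------------------------------------------------------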
C1 + movq mm7,[ebx + coeffs+48 wrt ..gotoff] ; C3 C1 C3 C1 ; rounder_op mm0, rounder_arg pmaddwd mm7,mm2 ; C3R3+C1R1 C3r3+C1r1 paddd mm4,mm5 ; A0 a0 @@ -847,7 +847,7 @@ coeffs: movq mm5,mm0 ; -C4R4+C4R0 -C4r4+C4r0 paddd mm0,mm1 ; A1 a1 psubd mm5,mm1 ; A2 a2 - movq mm1,[coeffs+64] + movq mm1,[ebx + coeffs+64 wrt ..gotoff] pmaddwd mm1,mm2 ; -C7R3+C3R1 -C7r3+C3r1 paddd mm7,mm4 ; A0+B0 a0+b0 paddd mm4,mm4 ; 2A0 2a0 @@ -867,9 +867,9 @@ coeffs: movd [dst + 96],mm3 packssdw mm4,mm4 ; A0-B0 a0-b0 movd [dst + 112],mm4 - movq mm4,[coeffs+80] ; -C1 C5 -C1 C5 + movq mm4,[ebx + coeffs+80 wrt ..gotoff] ; -C1 C5 -C1 C5 pmaddwd mm4,mm2 ; -C1R3+C5R1 -C1r3+C5r1 - pmaddwd mm2,[coeffs+96] ; -C5R3+C7R1 -C5r3+C7r1 + pmaddwd mm2,[ebx + coeffs+96 wrt ..gotoff] ; -C5R3+C7R1 -C5r3+C7r1 movq mm3,mm5 ; A2 a2 paddd mm3,mm4 ; A2+B2 a2+b2 psubd mm5,mm4 ; a2-B2 a2-b2 @@ -912,20 +912,20 @@ coeffs: %define rounder_arg %7 %define shift %8 movq mm0,[src0] ; R4 R0 r4 r0 - movq mm4,[coeffs+16] ; C4 C4 C4 C4 + movq mm4,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm4,mm0 ; C4R4+C4R0 C4r4+C4r0 - movq mm5,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm5,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm0,mm5 ; -C4R4+C4R0 -C4r4+C4r0 ; rounder_op mm4, rounder_arg ; rounder_op mm0, rounder_arg psrad mm4,shift psrad mm0,shift movq mm2,[src0 + 8] ; R4 R0 r4 r0 - movq mm1,[coeffs+16] ; C4 C4 C4 C4 + movq mm1,[ebx + coeffs+16 wrt ..gotoff] ; C4 C4 C4 C4 pmaddwd mm1,mm2 ; C4R4+C4R0 C4r4+C4r0 - movq mm7,[coeffs+24] ; -C4 C4 -C4 C4 + movq mm7,[ebx + coeffs+24 wrt ..gotoff] ; -C4 C4 -C4 C4 pmaddwd mm2,mm7 ; -C4R4+C4R0 -C4r4+C4r0 - movq mm7,[coeffs+32] ; C6 C2 C6 C2 + movq mm7,[ebx + coeffs+32 wrt ..gotoff] ; C6 C2 C6 C2 ; rounder_op mm1, rounder_arg ; rounder_op mm2, rounder_arg psrad mm1,shift @@ -1073,6 +1073,11 @@ coeffs: SECTION .text +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bx: + mov ebx, [esp] + retn + cglobal simple_idct_mmx_P cglobal simple_idct_mmx @@ -1083,14 +1088,18 @@ cglobal simple_idct_mmx ALIGN 16 simple_idct_mmx_P: + push ebx + call get_pc.bx + add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + sub esp, 128 - mov edx, [esp+128+4] + mov edx, [esp+128+4+4] ; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt - DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11 - Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .four - Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .two - Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .one + DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [ebx + coeffs+8 wrt ..gotoff], 11 + Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [ebx + coeffs wrt ..gotoff], 11, .four + Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .two + Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .one IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 @@ -1099,8 +1108,8 @@ simple_idct_mmx_P: ALIGN 16 .four - Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .six - Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .five + Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .six + Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .five IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 
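; -----------------------------------------------------------------------------
; [Editor's sketch, following the patch's own conventions.] IA-32 has no
; direct way to read EIP, so the patch adds one tiny helper per base register
; (get_pc.bx here; get_pc.bp and get_pc.cx appear in later files). The helper
; peeks at the return address that CALL pushed instead of popping it, which
; keeps CALL paired with RET and leaves the CPU's return-stack branch
; predictor in sync; the naive call/pop form shown first gives the same
; result but unbalances that predictor:
BITS 32
get_pc_inline:                     ; call/pop variant (predictor-hostile)
    call .here
.here:
    pop ebx                        ; ebx = EIP of .here
    ret
get_pc_bx:                         ; helper variant the patch uses
    mov ebx, [esp]                 ; peek: ebx = caller's return address
    retn                           ; balanced return, predictor stays in sync
; Side effect worth noting: each patched function does `push ebx` before the
; helper call, so every [esp+...] argument offset grows by 4; that is why
; `mov edx, [esp+128+4]` became `mov edx, [esp+128+4+4]` above.
; -----------------------------------------------------------------------------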
IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 @@ -1109,7 +1118,7 @@ ALIGN 16 ALIGN 16 .six - Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .seven + Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .seven IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 @@ -1118,7 +1127,7 @@ ALIGN 16 ALIGN 16 .two - Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .three + Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .three IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 @@ -1159,6 +1168,7 @@ ALIGN 16 .ret add esp, 128 + pop ebx ret .endfunc @@ -1174,15 +1184,19 @@ ALIGN 16 ALIGN 16 simple_idct_mmx: + push ebx + call get_pc.bx + add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + sub esp, 128 - mov edx, [esp+128+4] + mov edx, [esp+128+4+4] PERMUTEP edx ; permute parm list in place ; src0, src4, src1, src5, dst, rndop, rndarg, shift, bt - DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [coeffs+8], 11 - Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [coeffs], 11, .fourP - Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .twoP - Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .oneP + DC_COND_IDCT edx+0, edx+8, edx+16, edx+24, esp, paddd, [ebx + coeffs+8 wrt ..gotoff], 11 + Z_COND_IDCT edx+32, edx+40, edx+48, edx+56, esp+32, paddd, [ebx + coeffs wrt ..gotoff], 11, .fourP + Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .twoP + Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .oneP IDCT0 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 IDCT0 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 IDCT0 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 @@ -1191,8 +1205,8 @@ simple_idct_mmx: ALIGN 16 .fourP - Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [coeffs], 11, .sixP - Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .fiveP + Z_COND_IDCT edx+64, edx+72, edx+80, edx+88, esp+64, paddd, [ebx + coeffs wrt ..gotoff], 11, .sixP + Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .fiveP IDCT4 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 IDCT4 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 IDCT4 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 @@ -1201,7 +1215,7 @@ ALIGN 16 ALIGN 16 .sixP - Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .sevenP + Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .sevenP IDCT6 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 IDCT6 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 IDCT6 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 @@ -1210,7 +1224,7 @@ ALIGN 16 ALIGN 16 .twoP - Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [coeffs], 11, .threeP + Z_COND_IDCT edx+96, edx+104,edx+112,edx+120,esp+96, paddd, [ebx + coeffs wrt ..gotoff], 11, .threeP IDCT2 esp, esp+64, esp+32, esp+96, edx, nop, 0, 20 IDCT2 esp+8, esp+72, esp+40, esp+104,edx+4, nop, 0, 20 IDCT2 esp+16, esp+80, esp+48, esp+112,edx+8, nop, 0, 20 @@ -1251,6 +1265,7 @@ ALIGN 16 .retP add esp, 128 + 
pop ebx ret .endfunc diff -urp xvidcore-1.1.0-old/src/image/x86_asm/colorspace_mmx.inc xvidcore-1.1.0/src/image/x86_asm/colorspace_mmx.inc --- xvidcore-1.1.0-old/src/image/x86_asm/colorspace_mmx.inc 2005-12-30 15:34:57.000000000 +0100 +++ xvidcore-1.1.0/src/image/x86_asm/colorspace_mmx.inc 2006-02-19 01:49:08.000000000 +0100 @@ -56,11 +56,13 @@ NAME: push edi ; esp + localsize + 4 push ebp ; esp + localsize + 0 + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + %define x_dif esp + localsize - 4 %define y_dif esp + localsize - 8 %define uv_dif esp + localsize - 12 %define fixed_width esp + localsize - 16 -%define tmp_height esp + localsize - 20 sub esp, localsize @@ -90,8 +92,6 @@ NAME: mov esi, [y_ptr] ; $esi$ = y_ptr mov edi, [x_ptr] ; $edi$ = x_ptr mov edx, [x_stride] ; $edx$ = x_stride - mov ebp, [height] ; $ebp$ = height - mov ebx, [vflip] or ebx, ebx @@ -106,7 +106,7 @@ NAME: sub ebx, edx mov [x_dif], ebx ; x_dif = -BYTES*fixed_width - x_stride - mov eax, ebp + mov eax, [height] sub eax, 1 push edx mul edx @@ -126,8 +126,6 @@ NAME: FUNC %+ _INIT ARG1, ARG2 ; call FUNC_INIT .y_loop - mov [tmp_height], ebp - mov ebp, [fixed_width] .x_loop FUNC ARG1, ARG2 ; call FUNC @@ -137,10 +135,9 @@ NAME: add ebx, PIXELS/2 ; u_ptr += PIXELS/2 add ecx, PIXELS/2 ; v_ptr += PIXELS/2 - sub ebp, PIXELS ; $ebp$ -= PIXELS + sub dword [fixed_width], PIXELS ; $ebp$ -= PIXELS jg .x_loop ; if ($ebp$ > 0) goto .x_loop - mov ebp, [tmp_height] add edi, [x_dif] ; x_ptr += x_dif + (VPIXELS-1)*x_stride add esi, [y_dif] ; y_ptr += y_dif + (VPIXELS-1)*y_stride %rep VPIXELS-1 @@ -155,7 +152,7 @@ NAME: add ecx, [uv_stride] %endrep - sub ebp, VPIXELS ; $ebp$ -= VPIXELS + sub dword [height], VPIXELS ; $ebp$ -= VPIXELS jg .y_loop ; if ($ebp$ > 0) goto .y_loop ; cleanup stack & undef everything @@ -181,7 +178,6 @@ NAME: %undef y_dif %undef uv_dif %undef fixed_width -%undef tmp_height ret .endfunc %undef NAME diff -urp xvidcore-1.1.0-old/src/image/x86_asm/colorspace_rgb_mmx.asm xvidcore-1.1.0/src/image/x86_asm/colorspace_rgb_mmx.asm --- xvidcore-1.1.0-old/src/image/x86_asm/colorspace_rgb_mmx.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/image/x86_asm/colorspace_rgb_mmx.asm 2006-02-19 01:49:08.000000000 +0100 @@ -120,7 +120,7 @@ BRIGHT: db 128, 128, 128, 128, 128, 128, ;------------------------------------------------------------------------------ %macro BGR_TO_YV12_INIT 2 - movq mm7, [y_mul] + movq mm7, [ebp + y_mul wrt ..gotoff] %endmacro @@ -184,8 +184,8 @@ BRIGHT: db 128, 128, 128, 128, 128, 128, ; u_ptr, v_ptr movq mm0, mm6 ; = [ |b4|g4|r4] - pmaddwd mm6, [v_mul] ; *= V_MUL - pmaddwd mm0, [u_mul] ; *= U_MUL + pmaddwd mm6, [ebp + v_mul wrt ..gotoff] ; *= V_MUL + pmaddwd mm0, [ebp + u_mul wrt ..gotoff] ; *= U_MUL movq mm1, mm0 movq mm2, mm6 psrlq mm1, 32 @@ -230,30 +230,30 @@ BRIGHT: db 128, 128, 128, 128, 128, 128, movd mm3, [ecx] ; v_ptr[0] punpcklbw mm2, mm7 ; u3u2u1u0 -> mm2 punpcklbw mm3, mm7 ; v3v2v1v0 -> mm3 - psubsw mm2, [U_SUB] ; U - 128 - psubsw mm3, [V_SUB] ; V - 128 + psubsw mm2, [ebp + U_SUB wrt ..gotoff] ; U - 128 + psubsw mm3, [ebp + V_SUB wrt ..gotoff] ; V - 128 movq mm4, mm2 movq mm5, mm3 - pmullw mm2, [UG_MUL] - pmullw mm3, [VG_MUL] + pmullw mm2, [ebp + UG_MUL wrt ..gotoff] + pmullw mm3, [ebp + VG_MUL wrt ..gotoff] movq mm6, mm2 ; u3u2u1u0 -> mm6 punpckhwd mm2, mm2 ; u3u3u2u2 -> mm2 punpcklwd mm6, mm6 ; u1u1u0u0 -> mm6 - pmullw mm4, [UB_MUL] ; B_ADD -> mm4 + pmullw mm4, [ebp + UB_MUL wrt ..gotoff] ; B_ADD -> mm4 movq mm0, mm3 punpckhwd mm3, mm3 ; v3v3v2v2 -> mm2 
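; -----------------------------------------------------------------------------
; [Editor's note.] In colorspace_mmx.inc the GOT base must stay in ebp for
; the whole conversion loop, so ebp can no longer double as the row/column
; counter. The counters are therefore decremented in their memory slots
; (`sub dword [fixed_width], PIXELS` and `sub dword [height], VPIXELS`), the
; tmp_height spill slot becomes redundant and is dropped, and the leftover
; `; $ebp$ -= ...` comments now describe the memory word rather than a
; register. Schematic of the trade, with a hypothetical counter value:
BITS 32
counter_in_memory:
    push dword 16                  ; counter lives on the stack, not in ebp
.y_loop:
    sub dword [esp], 4             ; was: sub ebp, 4
    jg .y_loop                     ; the memory SUB still sets flags for JG
    add esp, byte 4
    ret
; -----------------------------------------------------------------------------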
punpcklwd mm0, mm0 ; v1v1v0v0 -> mm6 paddsw mm2, mm3 paddsw mm6, mm0 - pmullw mm5, [VR_MUL] ; R_ADD -> mm5 + pmullw mm5, [ebp + VR_MUL wrt ..gotoff] ; R_ADD -> mm5 movq mm0, [esi] ; y7y6y5y4y3y2y1y0 -> mm0 movq mm1, mm0 punpckhbw mm1, mm7 ; y7y6y5y4 -> mm1 punpcklbw mm0, mm7 ; y3y2y1y0 -> mm0 - psubsw mm0, [Y_SUB] ; Y - Y_SUB - psubsw mm1, [Y_SUB] ; Y - Y_SUB - pmullw mm1, [Y_MUL] - pmullw mm0, [Y_MUL] + psubsw mm0, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB + psubsw mm1, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB + pmullw mm1, [ebp + Y_MUL wrt ..gotoff] + pmullw mm0, [ebp + Y_MUL wrt ..gotoff] movq [TEMP_Y2], mm1 ; y7y6y5y4 -> mm3 movq [TEMP_Y1], mm0 ; y3y2y1y0 -> mm7 psubsw mm1, mm2 ; g7g6g5g4 -> mm1 @@ -266,10 +266,10 @@ BRIGHT: db 128, 128, 128, 128, 128, 128, movq mm1, mm0 punpckhbw mm1, mm7 ; y7y6y5y4 -> mm1 punpcklbw mm0, mm7 ; y3y2y1y0 -> mm0 - psubsw mm0, [Y_SUB] ; Y - Y_SUB - psubsw mm1, [Y_SUB] ; Y - Y_SUB - pmullw mm1, [Y_MUL] - pmullw mm0, [Y_MUL] + psubsw mm0, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB + psubsw mm1, [ebp + Y_SUB wrt ..gotoff] ; Y - Y_SUB + pmullw mm1, [ebp + Y_MUL wrt ..gotoff] + pmullw mm0, [ebp + Y_MUL wrt ..gotoff] movq mm3, mm1 psubsw mm1, mm2 ; g7g6g5g4 -> mm1 movq mm2, mm0 @@ -419,6 +419,11 @@ BRIGHT: db 128, 128, 128, 128, 128, 128, SECTION .text +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bp: + mov ebp, [esp] + retn + %include "colorspace_mmx.inc" ; input diff -urp xvidcore-1.1.0-old/src/image/x86_asm/colorspace_yuyv_mmx.asm xvidcore-1.1.0/src/image/x86_asm/colorspace_yuyv_mmx.asm --- xvidcore-1.1.0-old/src/image/x86_asm/colorspace_yuyv_mmx.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/image/x86_asm/colorspace_yuyv_mmx.asm 2006-02-19 01:49:08.000000000 +0100 @@ -76,7 +76,7 @@ mmx_one: dw 1, 1, 1, 1 ;----------------------------------------------------------------------------- %macro YUYV_TO_YV12_INIT 2 - movq mm7, [yuyv_mask] + movq mm7, [ebp + yuyv_mask wrt ..gotoff] %endmacro @@ -108,8 +108,8 @@ mmx_one: dw 1, 1, 1, 1 pand mm5, mm7 pand mm6, mm7 paddw mm5, mm6 - paddw mm4, [mmx_one] ; +1 rounding - paddw mm5, [mmx_one] ; + paddw mm4, [ebp + mmx_one wrt ..gotoff] ; +1 rounding + paddw mm5, [ebp + mmx_one wrt ..gotoff] ; psrlw mm4, 1 psrlw mm5, 1 ;---[ 3dnow/xmm ]---------------------------------------------------- @@ -310,6 +310,11 @@ mmx_one: dw 1, 1, 1, 1 SECTION .text +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bp: + mov ebp, [esp] + retn + %include "colorspace_mmx.inc" ; input diff -urp xvidcore-1.1.0-old/src/image/x86_asm/interpolate8x8_3dn.asm xvidcore-1.1.0/src/image/x86_asm/interpolate8x8_3dn.asm --- xvidcore-1.1.0-old/src/image/x86_asm/interpolate8x8_3dn.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/image/x86_asm/interpolate8x8_3dn.asm 2006-02-19 02:45:55.000000000 +0100 @@ -44,20 +44,6 @@ BITS 32 %endmacro ;============================================================================= -; Read Only data -;============================================================================= - -%ifdef FORMAT_COFF -SECTION .rodata -%else -SECTION .rodata align=16 -%endif - -ALIGN 16 -mmx_one: - times 8 db 1 - -;============================================================================= ; Code ;============================================================================= @@ -132,7 +118,10 @@ interpolate8x8_halfpel_h_3dn: .rounding1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 COPY_H_3DN_RND1 lea ecx, [ecx+2*edx] COPY_H_3DN_RND1 @@ -206,7 
+195,10 @@ interpolate8x8_halfpel_v_3dn: .rounding1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 movq mm2, [eax] ; loop invariant add eax, edx @@ -329,7 +321,10 @@ interpolate8x8_halfpel_hv_3dn mov eax, [esp+ 8] ; Src mov edx, [esp+12] ; stride - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j movq mm2, [eax] @@ -387,7 +382,10 @@ interpolate8x4_halfpel_h_3dn: .rounding1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 COPY_H_3DN_RND1 lea ecx, [ecx+2*edx] COPY_H_3DN_RND1 @@ -424,7 +422,10 @@ interpolate8x4_halfpel_v_3dn: .rounding1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 movq mm2, [eax] ; loop invariant add eax, edx @@ -462,7 +463,10 @@ interpolate8x4_halfpel_hv_3dn mov eax, [esp+ 8] ; Src mov edx, [esp+12] ; stride - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j movq mm2, [eax] diff -urp xvidcore-1.1.0-old/src/image/x86_asm/interpolate8x8_3dne.asm xvidcore-1.1.0/src/image/x86_asm/interpolate8x8_3dne.asm --- xvidcore-1.1.0-old/src/image/x86_asm/interpolate8x8_3dne.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/image/x86_asm/interpolate8x8_3dne.asm 2006-02-19 02:52:52.000000000 +0100 @@ -45,24 +45,6 @@ BITS 32 %endmacro ;============================================================================= -; Read only data -;============================================================================= - -%ifdef FORMAT_COFF -SECTION .rodata -%else -SECTION .rodata align=16 -%endif - -ALIGN 16 -mmx_one: - times 8 db 1 - -ALIGN 8 -mm_minusone: - dd -1,-1 - -;============================================================================= ; Macros ;============================================================================= @@ -149,7 +131,10 @@ interpolate8x8_halfpel_h_3dne: .rounding1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 mov ecx, [esp+ 4] ; Dst - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 COPY_H_SSE_RND1 lea ecx, [ecx+2*edx] COPY_H_SSE_RND1 @@ -223,15 +208,15 @@ ALIGN 8 psubusb mm0, [eax] add eax, edx mov ecx, [esp+ 4] ; Dst - push esi + push byte -1 + push byte -1 pcmpeqb mm1, mm1 pcmpeqb mm2, mm2 - mov esi, mm_minusone psubusb mm1, [byte eax] psubusb mm2, [eax+edx] lea eax, [eax+2*edx] - movq mm6, [esi] - movq mm7, [esi] + movq mm6, [esp] + movq mm7, [esp] pavgb mm0, mm1 pavgb mm1, mm2 psubusb mm6, mm0 @@ -246,8 +231,8 @@ ALIGN 8 lea eax, [eax+2*edx] pavgb mm2, mm3 pavgb mm3, mm4 - movq mm0, [esi] - movq mm1, [esi] + movq mm0, [esp] + movq mm1, [esp] psubusb mm0, mm2 psubusb mm1, mm3 movq [ecx], mm0 @@ -261,8 +246,8 @@ ALIGN 8 lea eax, [eax+2*edx] pavgb mm4, mm5 pavgb mm5, mm6 - movq mm2, [esi] - movq mm3, [esi] + movq mm2, [esp] + movq mm3, [esp] psubusb mm2, mm4 psubusb mm3, mm5 movq [ecx], mm2 @@ -274,10 +259,10 @@ ALIGN 8 psubusb mm0, [eax+edx] pavgb mm6, mm7 pavgb mm7, mm0 - movq mm4, [esi] - movq mm5, [esi] + movq mm4, [esp] + movq mm5, [esp] psubusb mm4, mm6 - pop esi + add esp, byte 8 psubusb mm5, mm7 movq [ecx], mm4 movq [ecx+edx], mm5 @@ -391,7 +376,10 @@ interpolate8x8_halfpel_hv_3dne: pavgb mm2, mm3 pxor 
mm3, mm6 ; mm2/mm3 ready mov ecx, [esp+ 4] ; Dst - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 jz near .rounding1 lea ebp,[byte ebp] @@ -443,7 +431,10 @@ interpolate8x4_halfpel_h_3dne: .rounding1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 mov ecx, [esp+ 4] ; Dst - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 COPY_H_SSE_RND1 lea ecx, [ecx+2*edx] COPY_H_SSE_RND1 @@ -501,16 +492,15 @@ ALIGN 8 add eax, edx ; eax==line1 mov ecx, [esp+ 4] ; Dst - push esi - pcmpeqb mm1, mm1 pcmpeqb mm2, mm2 - mov esi, mm_minusone + push byte -1 + push byte -1 psubusb mm1, [byte eax] ; line1 psubusb mm2, [eax+edx] ; line2 lea eax, [eax+2*edx] ; eax==line3 - movq mm6, [esi] - movq mm7, [esi] + movq mm6, [esp] + movq mm7, [esp] pavgb mm0, mm1 pavgb mm1, mm2 psubusb mm6, mm0 @@ -526,15 +516,13 @@ ALIGN 8 lea eax, [eax+2*edx] ; eax==line 5 pavgb mm2, mm3 pavgb mm3, mm4 - movq mm0, [esi] - movq mm1, [esi] + movq mm0, [esp] + movq mm1, [esp] psubusb mm0, mm2 psubusb mm1, mm3 movq [ecx], mm0 movq [ecx+edx], mm1 - pop esi - ret .endfunc @@ -562,7 +550,10 @@ interpolate8x4_halfpel_hv_3dne: pavgb mm2, mm3 pxor mm3, mm6 ; mm2/mm3 ready mov ecx, [esp+ 4] ; Dst - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 jz near .rounding1 lea ebp,[byte ebp] diff -urp xvidcore-1.1.0-old/src/image/x86_asm/interpolate8x8_mmx.asm xvidcore-1.1.0/src/image/x86_asm/interpolate8x8_mmx.asm --- xvidcore-1.1.0-old/src/image/x86_asm/interpolate8x8_mmx.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/image/x86_asm/interpolate8x8_mmx.asm 2006-02-19 03:14:03.000000000 +0100 @@ -166,13 +166,17 @@ interpolate8x8_halfpel_h_mmx: push esi push edi - mov eax, [esp + 8 + 16] ; rounding + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc - movq mm7, [rounding1_mmx + eax * 8] + mov eax, [esp + 12 + 16] ; rounding - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src - mov edx, [esp + 8 + 12] ; stride + movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff] + + mov edi, [esp + 12 + 4] ; dst + mov esi, [esp + 12 + 8] ; src + mov edx, [esp + 12 + 12] ; stride pxor mm6, mm6 ; zero @@ -185,6 +189,7 @@ interpolate8x8_halfpel_h_mmx: COPY_H_MMX COPY_H_MMX + pop ebp pop edi pop esi @@ -225,13 +230,17 @@ interpolate8x8_halfpel_v_mmx: push esi push edi - mov eax, [esp + 8 + 16] ; rounding + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov eax, [esp + 12 + 16] ; rounding - movq mm7, [rounding1_mmx + eax * 8] + movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff] - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src - mov edx, [esp + 8 + 12] ; stride + mov edi, [esp + 12 + 4] ; dst + mov esi, [esp + 12 + 8] ; src + mov edx, [esp + 12 + 12] ; stride pxor mm6, mm6 ; zero @@ -245,6 +254,7 @@ interpolate8x8_halfpel_v_mmx: COPY_V_MMX COPY_V_MMX + pop ebp pop edi pop esi @@ -315,18 +325,22 @@ interpolate8x8_halfpel_hv_mmx: push esi push edi - mov eax, [esp + 8 + 16] ; rounding + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc - movq mm7, [rounding2_mmx + eax * 8] + mov eax, [esp + 12 + 16] ; rounding - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src + movq mm7, [ebp + rounding2_mmx + eax * 8 wrt ..gotoff] + + mov edi, [esp + 12 + 4] ; dst + mov esi, [esp + 12 + 8] ; src mov eax, 8 pxor mm6, mm6 ; zero - mov edx, [esp + 8 + 12] ; stride + mov edx, [esp + 12 + 12] ; 
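; -----------------------------------------------------------------------------
; [Sketch, not from the patch.] Where a routine needs only a one-off 8-byte
; constant (mmx_one, mm_minusone), the patch deletes the .rodata symbol
; entirely and builds the value on the stack, which needs neither a GOT
; pointer nor a spare base register. movq tolerates any alignment, so a
; 4-byte-aligned esp is fine here; the SSE2 variants must instead pad esp to
; a 16-byte boundary (as cbp_sse2.asm does) or use movdqu. Standalone form:
BITS 32
all_ones_bytes:
    push dword 0x01010101          ; high half of 01 01 01 01 01 01 01 01
    push dword 0x01010101          ; low half; [esp] now holds the qword
    movq mm7, [esp]                ; mm7 = eight bytes of 1, no data section
    add esp, byte 8                ; release the scratch qword
    ret
all_minusone:
    push byte -1                   ; `push byte` sign-extends to 0xFFFFFFFF
    push byte -1                   ; two pushes form a qword of 0xFF bytes
    movq mm6, [esp]                ; mm6 = -1 in every lane
    add esp, byte 8
    ret
; -----------------------------------------------------------------------------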
stride COPY_HV_MMX COPY_HV_MMX @@ -337,6 +351,7 @@ interpolate8x8_halfpel_hv_mmx: COPY_HV_MMX COPY_HV_MMX + pop ebp pop edi pop esi @@ -357,13 +372,18 @@ interpolate8x4_halfpel_h_mmx: push esi push edi - mov eax, [esp + 8 + 16] ; rounding - movq mm7, [rounding1_mmx + eax * 8] + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov eax, [esp + 12 + 16] ; rounding - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src - mov edx, [esp + 8 + 12] ; stride + movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff] + + mov edi, [esp + 12 + 4] ; dst + mov esi, [esp + 12 + 8] ; src + mov edx, [esp + 12 + 12] ; stride pxor mm6, mm6 ; zero @@ -372,6 +392,7 @@ interpolate8x4_halfpel_h_mmx: COPY_H_MMX COPY_H_MMX + pop ebp pop edi pop esi @@ -394,13 +415,17 @@ interpolate8x4_halfpel_v_mmx: push esi push edi - mov eax, [esp + 8 + 16] ; rounding + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov eax, [esp + 12 + 16] ; rounding - movq mm7, [rounding1_mmx + eax * 8] + movq mm7, [ebp + rounding1_mmx + eax * 8 wrt ..gotoff] - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src - mov edx, [esp + 8 + 12] ; stride + mov edi, [esp + 12 + 4] ; dst + mov esi, [esp + 12 + 8] ; src + mov edx, [esp + 12 + 12] ; stride pxor mm6, mm6 ; zero @@ -410,6 +435,7 @@ interpolate8x4_halfpel_v_mmx: COPY_V_MMX COPY_V_MMX + pop ebp pop edi pop esi @@ -433,24 +459,29 @@ interpolate8x4_halfpel_hv_mmx: push esi push edi - mov eax, [esp + 8 + 16] ; rounding + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc - movq mm7, [rounding2_mmx + eax * 8] + mov eax, [esp + 12 + 16] ; rounding - mov edi, [esp + 8 + 4] ; dst - mov esi, [esp + 8 + 8] ; src + movq mm7, [ebp + rounding2_mmx + eax * 8 wrt ..gotoff] + + mov edi, [esp + 12 + 4] ; dst + mov esi, [esp + 12 + 8] ; src mov eax, 8 pxor mm6, mm6 ; zero - mov edx, [esp + 8 + 12] ; stride + mov edx, [esp + 12 + 12] ; stride COPY_HV_MMX COPY_HV_MMX COPY_HV_MMX COPY_HV_MMX + pop ebp pop edi pop esi @@ -491,10 +522,10 @@ interpolate8x4_halfpel_hv_mmx: por mm3, mm6 - pand mm0, [mmx_mask] - pand mm1, [mmx_mask] - pand mm4, [mmx_mask] - pand mm5, [mmx_mask] + pand mm0, [ebp + mmx_mask wrt ..gotoff] + pand mm1, [ebp + mmx_mask wrt ..gotoff] + pand mm4, [ebp + mmx_mask wrt ..gotoff] + pand mm5, [ebp + mmx_mask wrt ..gotoff] psrlq mm0, 1 ; src1 / 2 psrlq mm1, 1 ; src2 / 2 @@ -538,10 +569,10 @@ interpolate8x4_halfpel_hv_mmx: pand mm3, mm6 - pand mm0, [mmx_mask] - pand mm1, [mmx_mask] - pand mm4, [mmx_mask] - pand mm5, [mmx_mask] + pand mm0, [ebp + mmx_mask wrt ..gotoff] + pand mm1, [ebp + mmx_mask wrt ..gotoff] + pand mm4, [ebp + mmx_mask wrt ..gotoff] + pand mm5, [ebp + mmx_mask wrt ..gotoff] psrlq mm0, 1 ; src1 / 2 psrlq mm1, 1 ; src2 / 2 @@ -567,21 +598,25 @@ interpolate8x8_avg2_mmx: push ebx - mov eax, [esp + 4 + 20] ; rounding + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov eax, [esp + 8 + 20] ; rounding test eax, eax jnz near .rounding1 - mov eax, [esp + 4 + 24] ; height -> eax + mov eax, [esp + 8 + 24] ; height -> eax sub eax, 8 test eax, eax - mov ecx, [esp + 4 + 4] ; dst -> edi - mov eax, [esp + 4 + 8] ; src1 -> esi - mov ebx, [esp + 4 + 12] ; src2 -> eax - mov edx, [esp + 4 + 16] ; stride -> edx + mov ecx, [esp + 8 + 4] ; dst -> edi + mov eax, [esp + 8 + 8] ; src1 -> esi + mov ebx, [esp + 8 + 12] ; src2 -> eax + mov edx, [esp + 8 + 16] ; stride -> edx - movq mm7, [mmx_one] + movq mm7, [ebp + mmx_one wrt ..gotoff] jz near .start0 @@ 
-602,16 +637,16 @@ interpolate8x8_avg2_mmx: ret .rounding1 - mov eax, [esp + 4 + 24] ; height -> eax + mov eax, [esp + 8 + 24] ; height -> eax sub eax, 8 test eax, eax - mov ecx, [esp + 4 + 4] ; dst -> edi - mov eax, [esp + 4 + 8] ; src1 -> esi - mov ebx, [esp + 4 + 12] ; src2 -> eax - mov edx, [esp + 4 + 16] ; stride -> edx + mov ecx, [esp + 8 + 4] ; dst -> edi + mov eax, [esp + 8 + 8] ; src1 -> esi + mov ebx, [esp + 8 + 12] ; src2 -> eax + mov edx, [esp + 8 + 16] ; stride -> edx - movq mm7, [mmx_one] + movq mm7, [ebp + mmx_one wrt ..gotoff] jz near .start1 @@ -628,6 +663,7 @@ interpolate8x8_avg2_mmx: lea ecx, [ecx+2*edx] AVG2_MMX_RND1 + pop ebp pop ebx ret .endfunc @@ -652,11 +688,11 @@ interpolate8x8_avg2_mmx: movq mm2, mm0 movq mm3, mm1 - pand mm2, [mmx_three] - pand mm3, [mmx_three] + pand mm2, [ebp + mmx_three wrt ..gotoff] + pand mm3, [ebp + mmx_three wrt ..gotoff] - pand mm0, [mmx_mask2] - pand mm1, [mmx_mask2] + pand mm0, [ebp + mmx_mask2 wrt ..gotoff] + pand mm1, [ebp + mmx_mask2 wrt ..gotoff] psrlq mm0, 2 psrlq mm1, 2 @@ -673,11 +709,11 @@ interpolate8x8_avg2_mmx: movq mm1, mm4 movq mm3, mm5 - pand mm1, [mmx_three] - pand mm3, [mmx_three] + pand mm1, [ebp + mmx_three wrt ..gotoff] + pand mm3, [ebp + mmx_three wrt ..gotoff] - pand mm4, [mmx_mask2] - pand mm5, [mmx_mask2] + pand mm4, [ebp + mmx_mask2 wrt ..gotoff] + pand mm5, [ebp + mmx_mask2 wrt ..gotoff] psrlq mm4, 2 psrlq mm5, 2 @@ -688,8 +724,8 @@ interpolate8x8_avg2_mmx: paddb mm1, mm3 paddb mm2, mm1 - paddb mm2, [mmx_two] - pand mm2, [mmx_mask2] + paddb mm2, [ebp + mmx_two wrt ..gotoff] + pand mm2, [ebp + mmx_mask2 wrt ..gotoff] psrlq mm2, 2 paddb mm0, mm2 @@ -707,11 +743,11 @@ interpolate8x8_avg2_mmx: movq mm2, mm0 movq mm3, mm1 - pand mm2, [mmx_three] - pand mm3, [mmx_three] + pand mm2, [ebp + mmx_three wrt ..gotoff] + pand mm3, [ebp + mmx_three wrt ..gotoff] - pand mm0, [mmx_mask2] - pand mm1, [mmx_mask2] + pand mm0, [ebp + mmx_mask2 wrt ..gotoff] + pand mm1, [ebp + mmx_mask2 wrt ..gotoff] psrlq mm0, 2 psrlq mm1, 2 @@ -728,11 +764,11 @@ interpolate8x8_avg2_mmx: movq mm1, mm4 movq mm3, mm5 - pand mm1, [mmx_three] - pand mm3, [mmx_three] + pand mm1, [ebp + mmx_three wrt ..gotoff] + pand mm3, [ebp + mmx_three wrt ..gotoff] - pand mm4, [mmx_mask2] - pand mm5, [mmx_mask2] + pand mm4, [ebp + mmx_mask2 wrt ..gotoff] + pand mm5, [ebp + mmx_mask2 wrt ..gotoff] psrlq mm4, 2 psrlq mm5, 2 @@ -743,8 +779,8 @@ interpolate8x8_avg2_mmx: paddb mm1, mm3 paddb mm2, mm1 - paddb mm2, [mmx_one] - pand mm2, [mmx_mask2] + paddb mm2, [ebp + mmx_one wrt ..gotoff] + pand mm2, [ebp + mmx_mask2 wrt ..gotoff] psrlq mm2, 2 paddb mm0, mm2 @@ -762,18 +798,22 @@ interpolate8x8_avg4_mmx: push edi push esi - mov eax, [esp + 12 + 28] ; rounding + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov eax, [esp + 16 + 28] ; rounding test eax, eax - mov ecx, [esp + 12 + 4] ; dst -> edi - mov eax, [esp + 12 + 8] ; src1 -> esi - mov ebx, [esp + 12 + 12] ; src2 -> eax - mov esi, [esp + 12 + 16] ; src3 -> esi - mov edi, [esp + 12 + 20] ; src4 -> edi - mov edx, [esp + 12 + 24] ; stride -> edx + mov ecx, [esp + 16 + 4] ; dst -> edi + mov eax, [esp + 16 + 8] ; src1 -> esi + mov ebx, [esp + 16 + 12] ; src2 -> eax + mov esi, [esp + 16 + 16] ; src3 -> esi + mov edi, [esp + 16 + 20] ; src4 -> edi + mov edx, [esp + 16 + 24] ; stride -> edx - movq mm7, [mmx_one] + movq mm7, [ebp + mmx_one wrt ..gotoff] jnz near .rounding1 @@ -815,6 +855,7 @@ interpolate8x8_avg4_mmx: lea ecx, [ecx+edx] AVG4_MMX_RND1 + pop ebp pop esi pop edi pop ebx @@ 
-868,8 +909,8 @@ interpolate8x8_avg4_mmx: psubsw mm0, mm2 psubsw mm1, mm3 - pmullw mm0, [mmx_five] - pmullw mm1, [mmx_five] + pmullw mm0, [ebp + mmx_five wrt ..gotoff] + pmullw mm1, [ebp + mmx_five wrt ..gotoff] movq mm2, [eax-2] movq mm4, [eax+3] @@ -903,13 +944,17 @@ interpolate8x8_avg4_mmx: ALIGN 16 interpolate8x8_6tap_lowpass_h_mmx: - mov eax, [esp + 16] ; rounding + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov eax, [esp + 20] ; rounding - movq mm6, [rounding_lowpass_mmx + eax * 8] + movq mm6, [ebp + rounding_lowpass_mmx + eax * 8 wrt ..gotoff] - mov ecx, [esp + 4] ; dst -> edi - mov eax, [esp + 8] ; src -> esi - mov edx, [esp + 12] ; stride -> edx + mov ecx, [esp + 8] ; dst -> edi + mov eax, [esp + 12] ; src -> esi + mov edx, [esp + 16] ; stride -> edx pxor mm7, mm7 @@ -929,6 +974,7 @@ interpolate8x8_6tap_lowpass_h_mmx: lea ecx, [ecx+edx] LOWPASS_6TAP_H_MMX + pop ebp ret .endfunc @@ -979,8 +1025,8 @@ interpolate8x8_6tap_lowpass_h_mmx: psubsw mm0, mm2 psubsw mm1, mm3 - pmullw mm0, [mmx_five] - pmullw mm1, [mmx_five] + pmullw mm0, [ebp + mmx_five wrt ..gotoff] + pmullw mm1, [ebp + mmx_five wrt ..gotoff] movq mm2, [eax+edx] movq mm4, [eax+2*ebx] @@ -1016,13 +1062,17 @@ interpolate8x8_6tap_lowpass_v_mmx: push ebx - mov eax, [esp + 4 + 16] ; rounding + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc - movq mm6, [rounding_lowpass_mmx + eax * 8] + mov eax, [esp + 8 + 16] ; rounding - mov ecx, [esp + 4 + 4] ; dst -> edi - mov eax, [esp + 4 + 8] ; src -> esi - mov edx, [esp + 4 + 12] ; stride -> edx + movq mm6, [ebp + rounding_lowpass_mmx + eax * 8 wrt ..gotoff] + + mov ecx, [esp + 8 + 4] ; dst -> edi + mov eax, [esp + 8 + 8] ; src -> esi + mov edx, [esp + 8 + 12] ; stride -> edx mov ebx, edx shl ebx, 1 @@ -1046,6 +1096,7 @@ interpolate8x8_6tap_lowpass_v_mmx: lea ecx, [ecx+edx] LOWPASS_6TAP_V_MMX + pop ebp pop ebx ret .endfunc @@ -1066,12 +1117,17 @@ interpolate8x8_6tap_lowpass_v_mmx: %macro PROLOG 2 ; %1: Rounder, %2 load Dst-Rounder pxor mm6, mm6 - movq mm7, [%1] ; TODO: dangerous! (eax isn't checked) + PROLOG0 + + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + %if %2 - movq mm5, [rounding1_mmx] + movq mm5, [ebp + rounding1_mmx wrt ..gotoff] %endif - PROLOG0 + movq mm7, [ebp + %1 wrt ..gotoff] ; TODO: dangerous! (eax isn't checked) %endmacro ; performs: mm0 == (mm0+mm2) mm1 == (mm1+mm3) @@ -1160,6 +1216,7 @@ interpolate8x8_halfpel_add_mmx: ADD_FF_MMX 1 ADD_FF_MMX 1 ADD_FF_MMX 0 + pop ebp ret .endfunc @@ -1206,6 +1263,7 @@ interpolate8x8_halfpel_h_add_mmx: ADD_FH_MMX lea ecx,[ecx+edx] ADD_FH_MMX + pop ebp ret .endfunc @@ -1253,6 +1311,7 @@ interpolate8x8_halfpel_v_add_mmx: ADD_HF_MMX lea ecx,[ecx+edx] ADD_HF_MMX + pop ebp ret .endfunc @@ -1318,8 +1377,8 @@ interpolate8x8_halfpel_v_add_mmx: paddusw mm0, mm4 ; mix Src(mm0/mm1) with Dst(mm2/mm3) paddusw mm1, mm5 - paddusw mm0, [rounding1_mmx] - paddusw mm1, [rounding1_mmx] + paddusw mm0, [ebp + rounding1_mmx wrt ..gotoff] + paddusw mm1, [ebp + rounding1_mmx wrt ..gotoff] psrlw mm0, 1 psrlw mm1, 1 @@ -1329,6 +1388,11 @@ interpolate8x8_halfpel_v_add_mmx: movq [ecx], mm0 %endmacro +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bp: + mov ebp, [esp] + retn + ALIGN 16 interpolate8x8_halfpel_hv_add_mmx: PROLOG rounding2_mmx, 0 ; mm5 is busy. 
Don't load dst-rounder @@ -1364,6 +1428,7 @@ interpolate8x8_halfpel_hv_add_mmx: lea ecx,[ecx+edx] ADD_HH_MMX + pop ebp ret .endfunc diff -urp xvidcore-1.1.0-old/src/image/x86_asm/interpolate8x8_xmm.asm xvidcore-1.1.0/src/image/x86_asm/interpolate8x8_xmm.asm --- xvidcore-1.1.0-old/src/image/x86_asm/interpolate8x8_xmm.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/image/x86_asm/interpolate8x8_xmm.asm 2006-02-19 03:04:35.000000000 +0100 @@ -42,20 +42,6 @@ BITS 32 %endif %endmacro -;============================================================================= -; Read only data -;============================================================================= - -%ifdef FORMAT_COFF -SECTION .rodata -%else -SECTION .rodata align=16 -%endif - -ALIGN 16 -mmx_one: - times 8 db 1 - SECTION .text cglobal interpolate8x8_halfpel_h_xmm @@ -132,7 +118,10 @@ interpolate8x8_halfpel_h_xmm: .rounding1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 COPY_H_SSE_RND1 lea ecx, [ecx+2*edx] COPY_H_SSE_RND1 @@ -204,7 +193,10 @@ interpolate8x8_halfpel_v_xmm: .rounding1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 movq mm2, [eax] ; loop invariant add eax, edx @@ -326,7 +318,10 @@ interpolate8x8_halfpel_hv_xmm: mov eax, [esp+ 8] ; Src mov edx, [esp+12] ; stride - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j movq mm2, [eax] @@ -384,7 +379,10 @@ interpolate8x4_halfpel_h_xmm: .rounding1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 COPY_H_SSE_RND1 lea ecx, [ecx+2*edx] COPY_H_SSE_RND1 @@ -419,7 +417,10 @@ interpolate8x4_halfpel_v_xmm: .rounding1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 movq mm2, [eax] ; loop invariant add eax, edx @@ -458,7 +459,10 @@ interpolate8x4_halfpel_hv_xmm: mov eax, [esp+ 8] ; Src mov edx, [esp+12] ; stride - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j movq mm2, [eax] @@ -583,8 +587,8 @@ interpolate8x8_halfpel_add_xmm: ; 23c pxor mm2, mm4 pavgb mm1, mm3 pxor mm3, mm5 - pand mm2, [mmx_one] - pand mm3, [mmx_one] + pand mm2, [esp] + pand mm3, [esp] psubb mm0, mm2 psubb mm1, mm3 pavgb mm0, [ecx+%1] @@ -612,6 +616,8 @@ interpolate8x8_halfpel_h_add_xmm: ; 32 .Loop1 ; we use: (i+j)/2 = ( i+j+1 )/2 - (i^j)&1 ; movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 ADD_FH_RND1 0, edx lea eax,[eax+2*edx] lea ecx,[ecx+2*edx] @@ -622,6 +628,7 @@ interpolate8x8_halfpel_h_add_xmm: ; 32 lea eax,[eax+2*edx] lea ecx,[ecx+2*edx] ADD_FH_RND1 0, edx + add esp, byte 8 EPILOG .endfunc @@ -686,7 +693,10 @@ interpolate8x8_halfpel_v_add_xmm: .Loop1 movq mm0, [eax] ; loop invariant - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] + add esp, byte 8 ADD_8_HF_RND1 movq mm0, mm2 @@ -809,7 +819,9 @@ ALIGN 16 interpolate8x8_halfpel_hv_add_xmm: PROLOG1 - movq mm7, [mmx_one] + push dword 0x01010101 + push dword 0x01010101 + movq mm7, [esp] ; loop invariants: mm2=(i+j+1)/2 and mm3= i^j movq mm2, [eax] @@ -838,6 +850,7 @@ interpolate8x8_halfpel_hv_add_xmm: 
add ecx, edx ADD_HH_RND1 + add esp, byte 8 EPILOG .endfunc diff -urp xvidcore-1.1.0-old/src/image/x86_asm/postprocessing_mmx.asm xvidcore-1.1.0/src/image/x86_asm/postprocessing_mmx.asm --- xvidcore-1.1.0-old/src/image/x86_asm/postprocessing_mmx.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/image/x86_asm/postprocessing_mmx.asm 2006-02-19 01:49:09.000000000 +0100 @@ -70,6 +70,11 @@ mmx_offset: SECTION .text +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bp: + mov ebp, [esp] + retn + cglobal image_brightness_mmx @@ -83,16 +88,19 @@ image_brightness_mmx: push esi push edi - movq mm6, [mmx_0x80] - mov eax, [esp+8+20] ; offset - movq mm7, [mmx_offset + (eax + 128)*8] ; being lazy - mov edx, [esp+8+4] ; Dst mov ecx, [esp+8+8] ; stride mov esi, [esp+8+12] ; width mov edi, [esp+8+16] ; height + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + movq mm6, [ebp + mmx_0x80 wrt ..gotoff] + movq mm7, [ebp + mmx_offset + (eax + 128)*8 wrt ..gotoff] ; being lazy + pop ebp + .yloop xor eax, eax diff -urp xvidcore-1.1.0-old/src/image/x86_asm/postprocessing_sse2.asm xvidcore-1.1.0/src/image/x86_asm/postprocessing_sse2.asm --- xvidcore-1.1.0-old/src/image/x86_asm/postprocessing_sse2.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/image/x86_asm/postprocessing_sse2.asm 2006-02-19 01:49:09.000000000 +0100 @@ -42,19 +42,6 @@ BITS 32 %endif %endmacro -;=========================================================================== -; read only data -;=========================================================================== - -%ifdef FORMAT_COFF -SECTION .rodata -%else -SECTION .rodata align=16 -%endif - -xmm_0x80: - times 16 db 0x80 - ;============================================================================= ; Code ;============================================================================= @@ -93,7 +80,12 @@ image_brightness_sse2: push edi ; 8 bytes offset for push sub esp, 32 ; 32 bytes for local data (16bytes will be used, 16bytes more to align correctly mod 16) - movdqa xmm6, [xmm_0x80] + push dword 0x80808080 + push dword 0x80808080 + push dword 0x80808080 + push dword 0x80808080 + movdqa xmm6, [esp] + add esp, byte 16 ; Create a offset...offset vector mov eax, [esp+8+32+20] ; brightness offset value diff -urp xvidcore-1.1.0-old/src/image/x86_asm/qpel_mmx.asm xvidcore-1.1.0/src/image/x86_asm/qpel_mmx.asm --- xvidcore-1.1.0-old/src/image/x86_asm/qpel_mmx.asm 2006-02-19 01:39:47.000000000 +0100 +++ xvidcore-1.1.0/src/image/x86_asm/qpel_mmx.asm 2006-02-19 01:49:09.000000000 +0100 @@ -201,6 +201,11 @@ FIR_C23: times 4 dw 23 SECTION .text +extern _GLOBAL_OFFSET_TABLE_ +get_pc.cx: + mov ecx, [esp] + retn + ;////////////////////////////////////////////////////////////////////// ;// Here we go with the Q-Pel mess. 
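; -----------------------------------------------------------------------------
; [Editor's note; the register analysis is mine, inferred from the code.] In
; qpel_mmx.asm the GOT base ends up in ecx because the other candidates are
; taken: esi/edi/ebp carry Src/Dst/BpS, eax holds Rnd and is reused as a
; scratch index, edx is the second scratch, and ebx points at the second
; rounder in the _AVRG prologues. ecx previously held the Size counter, so
; the patched loops count the caller's stack copy down in place
; (`dec dword [esp+20 + 2*4]`); PROLOG_NO_AVRG gains a `push ebx` so both
; prologues leave the arguments at the same +20 offset. Sketch of the shape:
BITS 32
extern _GLOBAL_OFFSET_TABLE_
get_pc_cx:
    mov ecx, [esp]
    retn
qpel_prolog_sketch:
    push ebx
    push esi
    push edi
    push ebp
    call get_pc_cx                 ; ecx = EIP of the following instruction
    add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc ; ecx = GOT base
.loop:
    dec dword [esp+20 + 2*4]       ; Size argument decremented in its slot
    jg .loop
    pop ebp
    pop edi
    pop esi
    pop ebx
    ret
; -----------------------------------------------------------------------------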
;// For horizontal passes, we process 4 *output* pixel in parallel @@ -208,19 +213,22 @@ SECTION .text ;////////////////////////////////////////////////////////////////////// %macro PROLOG_NO_AVRG 0 + push ebx push esi push edi push ebp - mov edi, [esp+16 + 0*4] ; Dst - mov esi, [esp+16 + 1*4] ; Src - mov ecx, [esp+16 + 2*4] ; Size - mov ebp, [esp+16 + 3*4] ; BpS - mov eax, [esp+16 + 4*4] ; Rnd + mov edi, [esp+20 + 0*4] ; Dst + mov esi, [esp+20 + 1*4] ; Src + mov ebp, [esp+20 + 3*4] ; BpS + mov eax, [esp+20 + 4*4] ; Rnd and eax, 1 - movq mm7, [Rounder_QP_MMX+eax*8] ; rounder + call get_pc.cx + add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + movq mm7, [ecx + Rounder_QP_MMX+eax*8 wrt ..gotoff] ; rounder %endmacro %macro EPILOG_NO_AVRG 0 + pop ebx pop ebp pop edi pop esi @@ -234,12 +242,13 @@ SECTION .text push ebp mov edi, [esp+20 + 0*4] ; Dst mov esi, [esp+20 + 1*4] ; Src - mov ecx, [esp+20 + 2*4] ; Size mov ebp, [esp+20 + 3*4] ; BpS mov eax, [esp+20 + 4*4] ; Rnd and eax, 1 - movq mm7, [Rounder_QP_MMX+eax*8] ; rounder - lea ebx, [Rounder1_MMX+eax*8] ; *Rounder2 + call get_pc.cx + add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + movq mm7, [ecx + Rounder_QP_MMX+eax*8 wrt ..gotoff] ; rounder + lea ebx, [ecx + Rounder1_MMX+eax*8 wrt ..gotoff] ; *Rounder2 %endmacro %macro EPILOG_AVRG 0 @@ -261,23 +270,23 @@ SECTION .text %macro TLOAD 2 ; %1,%2: src pixels movzx eax, byte [esi+%1] movzx edx, byte [esi+%2] - movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ] - movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ] + movq mm0, [ecx + xvid_FIR_14_3_2_1 + eax*8 wrt ..gotoff] + movq mm3, [ecx + xvid_FIR_1_2_3_14 + edx*8 wrt ..gotoff] paddw mm0, mm7 paddw mm3, mm7 %endmacro %macro TACCUM2 5 ;%1:src pixel/%2-%3:Taps tables/ %4-%5:dst regs movzx eax, byte [esi+%1] - paddw %4, [%2 + eax*8] - paddw %5, [%3 + eax*8] + paddw %4, [eax*8 + %2] + paddw %5, [eax*8 + %3] %endmacro %macro TACCUM3 7 ;%1:src pixel/%2-%4:Taps tables/%5-%7:dst regs movzx eax, byte [esi+%1] - paddw %5, [%2 + eax*8] - paddw %6, [%3 + eax*8] - paddw %7, [%4 + eax*8] + paddw %5, [eax*8 + %2] + paddw %6, [eax*8 + %3] + paddw %7, [eax*8 + %4] %endmacro ;////////////////////////////////////////////////////////////////////// @@ -287,32 +296,32 @@ SECTION .text %macro LOAD 2 ; %1,%2: src pixels movzx eax, byte [esi+%1] movzx edx, byte [esi+%2] - movq mm0, [xvid_Expand_mmx + eax*8] - movq mm3, [xvid_Expand_mmx + edx*8] - pmullw mm0, [FIR_R0 ] - pmullw mm3, [FIR_R16] + movq mm0, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff] + movq mm3, [ecx + xvid_Expand_mmx + edx*8 wrt ..gotoff] + pmullw mm0, [ecx + FIR_R0 wrt ..gotoff] + pmullw mm3, [ecx + FIR_R16 wrt ..gotoff] paddw mm0, mm7 paddw mm3, mm7 %endmacro %macro ACCUM2 4 ;src pixel/Taps/dst regs #1-#2 movzx eax, byte [esi+%1] - movq mm4, [xvid_Expand_mmx + eax*8] + movq mm4, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff] movq mm5, mm4 pmullw mm4, [%2] - pmullw mm5, [%2+8] + pmullw mm5, [8+%2] paddw %3, mm4 paddw %4, mm5 %endmacro %macro ACCUM3 5 ;src pixel/Taps/dst regs #1-#2-#3 movzx eax, byte [esi+%1] - movq mm4, [xvid_Expand_mmx + eax*8] + movq mm4, [ecx + xvid_Expand_mmx + eax*8 wrt ..gotoff] movq mm5, mm4 movq mm6, mm5 - pmullw mm4, [%2 ] - pmullw mm5, [%2+ 8] - pmullw mm6, [%2+16] + pmullw mm4, [ %2] + pmullw mm5, [ 8+%2] + pmullw mm6, [16+%2] paddw %3, mm4 paddw %4, mm5 paddw %5, mm6 @@ -359,23 +368,23 @@ SECTION .text movq mm1, mm7 movq mm2, mm7 - ACCUM2 1, FIR_R1, mm0, mm1 - ACCUM2 2, FIR_R2, mm0, mm1 - ACCUM2 3, FIR_R3, mm0, mm1 - ACCUM2 4, FIR_R4, mm0, mm1 - - ACCUM3 5, FIR_R5, mm0, mm1, mm2 - ACCUM3 6, 
FIR_R6, mm0, mm1, mm2 - ACCUM3 7, FIR_R7, mm0, mm1, mm2 - ACCUM2 8, FIR_R8, mm1, mm2 - ACCUM3 9, FIR_R9, mm1, mm2, mm3 - ACCUM3 10, FIR_R10,mm1, mm2, mm3 - ACCUM3 11, FIR_R11,mm1, mm2, mm3 - - ACCUM2 12, FIR_R12, mm2, mm3 - ACCUM2 13, FIR_R13, mm2, mm3 - ACCUM2 14, FIR_R14, mm2, mm3 - ACCUM2 15, FIR_R15, mm2, mm3 + ACCUM2 1, ecx + FIR_R1 wrt ..gotoff, mm0, mm1 + ACCUM2 2, ecx + FIR_R2 wrt ..gotoff, mm0, mm1 + ACCUM2 3, ecx + FIR_R3 wrt ..gotoff, mm0, mm1 + ACCUM2 4, ecx + FIR_R4 wrt ..gotoff, mm0, mm1 + + ACCUM3 5, ecx + FIR_R5 wrt ..gotoff, mm0, mm1, mm2 + ACCUM3 6, ecx + FIR_R6 wrt ..gotoff, mm0, mm1, mm2 + ACCUM3 7, ecx + FIR_R7 wrt ..gotoff, mm0, mm1, mm2 + ACCUM2 8, ecx + FIR_R8 wrt ..gotoff, mm1, mm2 + ACCUM3 9, ecx + FIR_R9 wrt ..gotoff, mm1, mm2, mm3 + ACCUM3 10, ecx + FIR_R10 wrt ..gotoff,mm1, mm2, mm3 + ACCUM3 11, ecx + FIR_R11 wrt ..gotoff,mm1, mm2, mm3 + + ACCUM2 12, ecx + FIR_R12 wrt ..gotoff, mm2, mm3 + ACCUM2 13, ecx + FIR_R13 wrt ..gotoff, mm2, mm3 + ACCUM2 14, ecx + FIR_R14 wrt ..gotoff, mm2, mm3 + ACCUM2 15, ecx + FIR_R15 wrt ..gotoff, mm2, mm3 %else @@ -383,25 +392,25 @@ SECTION .text movq mm1, mm7 movq mm2, mm7 - TACCUM2 1, xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm1 - TACCUM2 2, xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1 - TACCUM2 3, xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1 - TACCUM2 4, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1, mm0, mm1 - - TACCUM3 5, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0 , mm0, mm1, mm2 - TACCUM3 6, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm1, mm2 - TACCUM3 7, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm1, mm2 - - TACCUM2 8, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm1, mm2 - - TACCUM3 9, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, xvid_FIR_1_0_0_0, mm1, mm2, mm3 - TACCUM3 10, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_6, xvid_FIR_3_1_0_0, mm1, mm2, mm3 - TACCUM3 11, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0, mm1, mm2, mm3 - - TACCUM2 12, xvid_FIR_1_3_6_20, xvid_FIR_20_6_3_1 , mm2, mm3 - TACCUM2 13, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm2, mm3 - TACCUM2 14, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm2, mm3 - TACCUM2 15, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm2, mm3 + TACCUM2 1, ecx + xvid_FIR_23_19_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm1 + TACCUM2 2, ecx + xvid_FIR_7_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm1 + TACCUM2 3, ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm1 + TACCUM2 4, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff, mm0, mm1 + + TACCUM3 5, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm1, mm2 + TACCUM3 6, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm1, mm2 + TACCUM3 7, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm1, mm2 + + TACCUM2 8, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm1, mm2 + + TACCUM3 9, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff, mm1, mm2, mm3 + TACCUM3 10, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff, mm1, mm2, mm3 + TACCUM3 11, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff,
mm1, mm2, mm3 + + TACCUM2 12, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff, ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm2, mm3 + TACCUM2 13, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, mm2, mm3 + TACCUM2 14, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_7 wrt ..gotoff, mm2, mm3 + TACCUM2 15, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_19_23 wrt ..gotoff, mm2, mm3 %endif @@ -418,7 +427,7 @@ SECTION .text MIX mm0, esi+1, ebx %endif %if (%2==1) - MIX mm0, edi, Rounder1_MMX + MIX mm0, edi, ecx + Rounder1_MMX wrt ..gotoff %endif %if (%1==1) @@ -427,7 +436,7 @@ SECTION .text MIX mm2, esi+9, ebx %endif %if (%2==1) - MIX mm2, edi+8, Rounder1_MMX + MIX mm2, edi+8, ecx + Rounder1_MMX wrt ..gotoff %endif lea esi, [esi+ebp] @@ -436,7 +445,7 @@ SECTION .text movq [edi+8], mm2 add edi, ebp - dec ecx + dec dword [esp+20 + 2*4] jg .Loop %if (%2==0) && (%1==0) @@ -464,64 +473,64 @@ SECTION .text %ifndef USE_TABLES LOAD 0, 8 ; special case for 1rst/last pixel - ACCUM2 1, FIR_R1, mm0, mm3 - ACCUM2 2, FIR_R2, mm0, mm3 - ACCUM2 3, FIR_R3, mm0, mm3 - ACCUM2 4, FIR_R4, mm0, mm3 - - ACCUM2 5, FIR_R13, mm0, mm3 - ACCUM2 6, FIR_R14, mm0, mm3 - ACCUM2 7, FIR_R15, mm0, mm3 + ACCUM2 1, ecx + FIR_R1 wrt ..gotoff, mm0, mm3 + ACCUM2 2, ecx + FIR_R2 wrt ..gotoff, mm0, mm3 + ACCUM2 3, ecx + FIR_R3 wrt ..gotoff, mm0, mm3 + ACCUM2 4, ecx + FIR_R4 wrt ..gotoff, mm0, mm3 + + ACCUM2 5, ecx + FIR_R13 wrt ..gotoff, mm0, mm3 + ACCUM2 6, ecx + FIR_R14 wrt ..gotoff, mm0, mm3 + ACCUM2 7, ecx + FIR_R15 wrt ..gotoff, mm0, mm3 %else %if 0 ; test with no unrolling TLOAD 0, 8 ; special case for 1rst/last pixel - TACCUM2 1, xvid_FIR_23_19_6_3, xvid_FIR_1_0_0_0 , mm0, mm3 - TACCUM2 2, xvid_FIR_7_20_20_6, xvid_FIR_3_1_0_0 , mm0, mm3 - TACCUM2 3, xvid_FIR_3_6_20_20, xvid_FIR_6_3_1_0 , mm0, mm3 - TACCUM2 4, xvid_FIR_1_3_6_20 , xvid_FIR_20_6_3_1 , mm0, mm3 - TACCUM2 5, xvid_FIR_0_1_3_6 , xvid_FIR_20_20_6_3, mm0, mm3 - TACCUM2 6, xvid_FIR_0_0_1_3 , xvid_FIR_6_20_20_7, mm0, mm3 - TACCUM2 7, xvid_FIR_0_0_0_1 , xvid_FIR_3_6_19_23, mm0, mm3 + TACCUM2 1, ecx + xvid_FIR_23_19_6_3 wrt ..gotoff, ecx + xvid_FIR_1_0_0_0 wrt ..gotoff , mm0, mm3 + TACCUM2 2, ecx + xvid_FIR_7_20_20_6 wrt ..gotoff, ecx + xvid_FIR_3_1_0_0 wrt ..gotoff , mm0, mm3 + TACCUM2 3, ecx + xvid_FIR_3_6_20_20 wrt ..gotoff, ecx + xvid_FIR_6_3_1_0 wrt ..gotoff , mm0, mm3 + TACCUM2 4, ecx + xvid_FIR_1_3_6_20 wrt ..gotoff , ecx + xvid_FIR_20_6_3_1 wrt ..gotoff , mm0, mm3 + TACCUM2 5, ecx + xvid_FIR_0_1_3_6 wrt ..gotoff , ecx + xvid_FIR_20_20_6_3 wrt ..gotoff, mm0, mm3 + TACCUM2 6, ecx + xvid_FIR_0_0_1_3 wrt ..gotoff , ecx + xvid_FIR_6_20_20_7 wrt ..gotoff, mm0, mm3 + TACCUM2 7, ecx + xvid_FIR_0_0_0_1 wrt ..gotoff , ecx + xvid_FIR_3_6_19_23 wrt ..gotoff, mm0, mm3 %else ; test with unrolling (little faster, but not much) movzx eax, byte [esi] movzx edx, byte [esi+8] - movq mm0, [xvid_FIR_14_3_2_1 + eax*8 ] + movq mm0, [ecx + xvid_FIR_14_3_2_1 + eax*8 wrt ..gotoff] movzx eax, byte [esi+1] - movq mm3, [xvid_FIR_1_2_3_14 + edx*8 ] + movq mm3, [ecx + xvid_FIR_1_2_3_14 + edx*8 wrt ..gotoff] paddw mm0, mm7 paddw mm3, mm7 movzx edx, byte [esi+2] - paddw mm0, [xvid_FIR_23_19_6_3 + eax*8] - paddw mm3, [xvid_FIR_1_0_0_0 + eax*8] + paddw mm0, [ecx + xvid_FIR_23_19_6_3 + eax*8 wrt ..gotoff] + paddw mm3, [ecx + xvid_FIR_1_0_0_0 + eax*8 wrt ..gotoff] movzx eax, byte [esi+3] - paddw mm0, [xvid_FIR_7_20_20_6 + edx*8] - paddw mm3, [xvid_FIR_3_1_0_0 + edx*8] + paddw mm0, [ecx + xvid_FIR_7_20_20_6 + edx*8 wrt ..gotoff] + paddw mm3, [ecx + xvid_FIR_3_1_0_0 + 
edx*8 wrt ..gotoff] movzx edx, byte [esi+4] - paddw mm0, [xvid_FIR_3_6_20_20 + eax*8] - paddw mm3, [xvid_FIR_6_3_1_0 + eax*8] + paddw mm0, [ecx + xvid_FIR_3_6_20_20 + eax*8 wrt ..gotoff] + paddw mm3, [ecx + xvid_FIR_6_3_1_0 + eax*8 wrt ..gotoff] movzx eax, byte [esi+5] - paddw mm0, [xvid_FIR_1_3_6_20 + edx*8] - paddw mm3, [xvid_FIR_20_6_3_1 + edx*8] + paddw mm0, [ecx + xvid_FIR_1_3_6_20 + edx*8 wrt ..gotoff] + paddw mm3, [ecx + xvid_FIR_20_6_3_1 + edx*8 wrt ..gotoff] movzx edx, byte [esi+6] - paddw mm0, [xvid_FIR_0_1_3_6 + eax*8] - paddw mm3, [xvid_FIR_20_20_6_3 + eax*8] + paddw mm0, [ecx + xvid_FIR_0_1_3_6 + eax*8 wrt ..gotoff] + paddw mm3, [ecx + xvid_FIR_20_20_6_3 + eax*8 wrt ..gotoff] movzx eax, byte [esi+7] - paddw mm0, [xvid_FIR_0_0_1_3 + edx*8] - paddw mm3, [xvid_FIR_6_20_20_7 + edx*8] + paddw mm0, [ecx + xvid_FIR_0_0_1_3 + edx*8 wrt ..gotoff] + paddw mm3, [ecx + xvid_FIR_6_20_20_7 + edx*8 wrt ..gotoff] - paddw mm0, [xvid_FIR_0_0_0_1 + eax*8] - paddw mm3, [xvid_FIR_3_6_19_23 + eax*8] + paddw mm0, [ecx + xvid_FIR_0_0_0_1 + eax*8 wrt ..gotoff] + paddw mm3, [ecx + xvid_FIR_3_6_19_23 + eax*8 wrt ..gotoff] %endif @@ -537,14 +546,14 @@ SECTION .text MIX mm0, esi+1, ebx %endif %if (%2==1) - MIX mm0, edi, Rounder1_MMX + MIX mm0, edi, ecx + Rounder1_MMX wrt ..gotoff %endif movq [edi], mm0 add edi, ebp add esi, ebp - dec ecx + dec dword [esp+20 + 2*4] jg .Loop %if (%2==0) && (%1==0) @@ -678,7 +687,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx: V_MIX %3, esi, ebx %endif %if (%2==1) - V_MIX %3, edi, Rounder1_MMX + V_MIX %3, edi, ecx + Rounder1_MMX wrt ..gotoff %endif movd eax, %3 @@ -718,28 +727,28 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx: movq mm3, mm7 V_LOAD 0 - V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1 + V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff V_LOAD 0 - V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3 + V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff V_LOAD 0 - V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6 + V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff V_LOAD 0 - V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20 + V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff V_LOAD 0 - V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20 + V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff V_STORE %1, %2, mm0, 0 V_LOAD 0 - V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3 - V_ACC1 mm3, FIR_Cm6 + V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff + V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff V_STORE %1, %2, mm1, 0 V_LOAD 0 - V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3 + V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff V_STORE %1, %2, mm2, 0 V_LOAD 1 - V_ACC1 mm3, FIR_Cm1 + V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff V_STORE %1, %2, mm3, 0 ; ouput rows [4..7], from input rows [1..11] (!!) 
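; -----------------------------------------------------------------------------
; [Sketch; `tbl` and PADD_TBL are hypothetical names.] The V_ACC*/ACCUM*/
; TACCUM* macros receive the whole address expression `ecx + table wrt
; ..gotoff` as one macro argument and add the brackets themselves. NASM wants
; the `wrt` clause at the end of an effective address, which is apparently
; why the macro bodies were reordered from `[%2 + eax*8]` to `[eax*8 + %2]`:
; the scaled index goes in front so the expanded `wrt ..gotoff` still closes
; the expression. Minimal demonstration:
BITS 32
extern _GLOBAL_OFFSET_TABLE_
%macro PADD_TBL 2                  ; %1 = scaled index, %2 = table expression
    paddw mm0, [%1 + %2]           ; index first, `wrt ..gotoff` stays last
%endmacro
SECTION .data
tbl: times 8 dw 1
SECTION .text
macro_demo:
    call .got
.got:
    pop ecx
    add ecx, _GLOBAL_OFFSET_TABLE_ + $$ - .got wrt ..gotpc
    xor eax, eax                   ; index 0 for the demonstration
    PADD_TBL eax*8, ecx + tbl wrt ..gotoff
    ret
; -----------------------------------------------------------------------------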
@@ -756,38 +765,38 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
 movq mm3, mm7
 V_LOAD 0
- V_ACC1 mm0, FIR_Cm1
+ V_ACC1 mm0, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC2l mm0, mm1, FIR_C3, FIR_Cm1
+ V_ACC2l mm0, mm1, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC2 mm0, mm1, FIR_Cm6, FIR_C3
- V_ACC1 mm2, FIR_Cm1
+ V_ACC2 mm0, mm1, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
+ V_ACC1 mm2, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
 V_STORE %1, %2, mm0, 0
 V_LOAD 0
- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
- V_ACC1 mm3, FIR_Cm6
+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
 V_STORE %1, %2, mm1, 0
 V_LOAD 0
- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
 V_STORE %1, %2, mm2, 0
 V_LOAD 1
- V_ACC1 mm3, FIR_Cm1
+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
 V_STORE %1, %2, mm3, 0
 ; ouput rows [8..11], from input rows [5..15]
@@ -804,39 +813,39 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
 movq mm3, mm7
 V_LOAD 0
- V_ACC1 mm0, FIR_Cm1
+ V_ACC1 mm0, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC2l mm0, mm1, FIR_C3, FIR_Cm1
+ V_ACC2l mm0, mm1, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC2 mm0, mm1, FIR_Cm6, FIR_C3
- V_ACC1 mm2, FIR_Cm1
+ V_ACC2 mm0, mm1, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
+ V_ACC1 mm2, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_Cm6, FIR_C3, FIR_Cm1
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C20, FIR_C20, FIR_Cm6, FIR_C3
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm6, FIR_C20, FIR_C20, FIR_Cm6
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
 V_STORE %1, %2, mm0, 0
 V_LOAD 0
- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
- V_ACC1 mm3, FIR_Cm6
+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
 V_STORE %1, %2, mm1, 0
 V_LOAD 0
- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
 V_STORE %1, %2, mm2, 0
 V_LOAD 1
- V_ACC1 mm3, FIR_Cm1
+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
 V_STORE %1, %2, mm3, 0
@@ -855,25 +864,25 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
 movq mm3, mm7
 V_LOAD 0
- V_ACC1 mm3, FIR_Cm1
+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
 V_LOAD 0
- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
- V_ACC1 mm3, FIR_Cm6
+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
 V_LOAD 1
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
 V_STORE %1, %2, mm3, 0
 V_STORE %1, %2, mm2, 0
@@ -886,7 +895,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
 pop edi
 add esi, 4
 add edi, 4
- sub ecx, 4
+ sub dword [esp+20 + 2*4], 4
 jg .Loop
 %if (%2==0) && (%1==0)
@@ -924,29 +933,29 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
 movq mm3, mm7
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
 V_STORE %1, %2, mm0, 0
 V_LOAD 0
- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
- V_ACC1 mm3, FIR_Cm6
+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
 V_STORE %1, %2, mm1, 0
 V_LOAD 0
- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
 V_STORE %1, %2, mm2, 0
 V_LOAD 1
- V_ACC1 mm3, FIR_Cm1
+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
 V_STORE %1, %2, mm3, 0
 ; ouput rows [4..7], from input rows [1..9]
@@ -964,25 +973,25 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
 movq mm3, mm7
 V_LOAD 0
- V_ACC1 mm3, FIR_Cm1
+ V_ACC1 mm3, ecx + FIR_Cm1 wrt ..gotoff
 V_LOAD 0
- V_ACC2l mm2, mm3, FIR_Cm1, FIR_C3
+ V_ACC2l mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
 V_LOAD 0
- V_ACC2 mm1, mm2, FIR_Cm1, FIR_C3
- V_ACC1 mm3, FIR_Cm6
+ V_ACC2 mm1, mm2, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
+ V_ACC1 mm3, ecx + FIR_Cm6 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm1, FIR_C3, FIR_Cm6, FIR_C20
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm1 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C3, FIR_Cm6, FIR_C20, FIR_C20
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C3 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_Cm7, FIR_C20, FIR_C20, FIR_Cm6
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_Cm7 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_C20 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff
 V_LOAD 0
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C23, FIR_C19, FIR_Cm6, FIR_C3
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C23 wrt ..gotoff, ecx + FIR_C19 wrt ..gotoff, ecx + FIR_Cm6 wrt ..gotoff, ecx + FIR_C3 wrt ..gotoff
 V_LOAD 1
- V_ACC4 mm0, mm1, mm2, mm3, FIR_C14, FIR_Cm3, FIR_C2, FIR_Cm1
+ V_ACC4 mm0, mm1, mm2, mm3, ecx + FIR_C14 wrt ..gotoff, ecx + FIR_Cm3 wrt ..gotoff, ecx + FIR_C2 wrt ..gotoff, ecx + FIR_Cm1 wrt ..gotoff
 V_STORE %1, %2, mm3, 0
 V_STORE %1, %2, mm2, 0
@@ -995,7 +1004,7 @@ xvid_H_Pass_Avrg_Up_8_Add_mmx:
 pop edi
 add esi, 4
 add edi, 4
- sub ecx, 4
+ sub dword [esp+20 + 2*4], 4
 jg .Loop
 %if (%2==0) && (%1==0)
diff -urp xvidcore-1.1.0-old/src/image/x86_asm/reduced_mmx.asm xvidcore-1.1.0/src/image/x86_asm/reduced_mmx.asm
--- xvidcore-1.1.0-old/src/image/x86_asm/reduced_mmx.asm 2006-02-19 01:39:47.000000000 +0100
+++ xvidcore-1.1.0/src/image/x86_asm/reduced_mmx.asm 2006-02-19 01:49:09.000000000 +0100
@@ -91,8 +91,8 @@ cglobal xvid_Filter_Diff_18x18_To_8x8_mm
 pmullw mm4, %4 ; [Up31]
 pmullw %2, %3 ; [Up13]
 pmullw mm5, %4 ; [Up31]
- paddsw %1, [Cst2]
- paddsw %2, [Cst2]
+ paddsw %1, [ebp + Cst2 wrt ..gotoff]
+ paddsw %2, [ebp + Cst2 wrt ..gotoff]
 paddsw %1, mm4
 paddsw %2, mm5
 %endmacro
@@ -126,14 +126,14 @@ cglobal xvid_Filter_Diff_18x18_To_8x8_mm
 %macro MIX_ROWS 4 ; %1/%2:prev %3/4:cur (preserved) mm4/mm5: output
 ; we need to perform: (%1,%3) -> (%1 = 3*%1+%3, mm4 = 3*%3+%1), %3 preserved.
- movq mm4, [Cst3]
- movq mm5, [Cst3]
+ movq mm4, [ebp + Cst3 wrt ..gotoff]
+ movq mm5, [ebp + Cst3 wrt ..gotoff]
 pmullw mm4, %3
 pmullw mm5, %4
 paddsw mm4, %1
 paddsw mm5, %2
- pmullw %1, [Cst3]
- pmullw %2, [Cst3]
+ pmullw %1, [ebp + Cst3 wrt ..gotoff]
+ pmullw %2, [ebp + Cst3 wrt ..gotoff]
 paddsw %1, %3
 paddsw %2, %4
 %endmacro
@@ -176,8 +176,12 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34
 mov edx, [esp+8] ; Src
 mov eax, [esp+12] ; BpS
- movq mm6, [Up13]
- movq mm7, [Up31]
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm6, [ebp + Up13 wrt ..gotoff]
+ movq mm7, [ebp + Up31 wrt ..gotoff]
 COL03 mm0, mm1, 0
 MUL_PACK mm0,mm1, mm6, mm7
@@ -223,7 +227,7 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34
 STORE_1 mm2, mm3
- mov ecx, [esp+4]
+ mov ecx, [esp+8]
 add ecx, 8
 COL47 mm0, mm1, 0
@@ -270,6 +274,7 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34
 STORE_1 mm2, mm3
+ pop ebp
 ret
 .endfunc
@@ -292,8 +297,8 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34
 ; (x*4 + 2)/4 = x - (x<0)
 ; So, better revert to (x*4)/4 = x.
- psubsw %1, [Cst2000]
- psubsw %2, [Cst0002]
+ psubsw %1, [ebp + Cst2000 wrt ..gotoff]
+ psubsw %2, [ebp + Cst0002 wrt ..gotoff]
 pxor mm6, mm6
 pxor mm7, mm7
 pcmpgtw mm6, %1
@@ -308,8 +313,8 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34
 ; mix with destination [ecx]
 movq mm6, [ecx]
 movq mm7, [ecx]
- punpcklbw mm6, [Cst0]
- punpckhbw mm7, [Cst0]
+ punpcklbw mm6, [ebp + Cst0 wrt ..gotoff]
+ punpckhbw mm7, [ebp + Cst0 wrt ..gotoff]
 paddsw %1, mm6
 paddsw %2, mm7
 packuswb %1,%2
@@ -342,16 +347,16 @@ xvid_Copy_Upsampled_8x8_16To8_mmx: ; 34
 ; mix with destination
 movq mm6, [ecx]
 movq mm7, [ecx]
- punpcklbw mm6, [Cst0]
- punpckhbw mm7, [Cst0]
+ punpcklbw mm6, [ebp + Cst0 wrt ..gotoff]
+ punpckhbw mm7, [ebp + Cst0 wrt ..gotoff]
 paddsw %1, mm6
 paddsw %2, mm7
 movq mm6, [ecx+eax]
 movq mm7, [ecx+eax]
- punpcklbw mm6, [Cst0]
- punpckhbw mm7, [Cst0]
+ punpcklbw mm6, [ebp + Cst0 wrt ..gotoff]
+ punpckhbw mm7, [ebp + Cst0 wrt ..gotoff]
 paddsw mm4, mm6
 paddsw mm5, mm7
@@ -373,98 +378,103 @@ xvid_Add_Upsampled_8x8_16To8_mmx: ; 579
 mov edx, [esp+8] ; Src
 mov eax, [esp+12] ; BpS
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
 COL03 mm0, mm1, 0
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 movq mm4, mm0
 movq mm5, mm1
 STORE_ADD_1 mm4, mm5
 add ecx, eax
 COL03 mm2, mm3, 1
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL03 mm0, mm1, 2
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL03 mm2, mm3, 3
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL03 mm0, mm1, 4
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL03 mm2, mm3, 5
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL03 mm0, mm1, 6
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL03 mm2, mm3, 7
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 STORE_ADD_1 mm2, mm3
- mov ecx, [esp+4]
+ mov ecx, [esp+8]
 add ecx, 8
 COL47 mm0, mm1, 0
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 movq mm4, mm0
 movq mm5, mm1
 STORE_ADD_1 mm4, mm5
 add ecx, eax
 COL47 mm2, mm3, 1
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL47 mm0, mm1, 2
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL47 mm2, mm3, 3
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL47 mm0, mm1, 4
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL47 mm2, mm3, 5
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL47 mm0, mm1, 6
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL47 mm2, mm3, 7
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 STORE_ADD_1 mm2, mm3
+ pop ebp
 ret
 .endfunc
@@ -503,8 +513,12 @@ xvid_Copy_Upsampled_8x8_16To8_xmm: ; 31
 mov edx, [esp+8] ; Src
 mov eax, [esp+12] ; BpS
- movq mm6, [Up13]
- movq mm7, [Up31]
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm6, [ebp + Up13 wrt ..gotoff]
+ movq mm7, [ebp + Up31 wrt ..gotoff]
 COL03_SSE mm0, mm1, 0
 MUL_PACK mm0,mm1, mm6, mm7
@@ -550,7 +564,7 @@ xvid_Copy_Upsampled_8x8_16To8_xmm: ; 31
 STORE_1 mm2, mm3
- mov ecx, [esp+4]
+ mov ecx, [esp+8]
 add ecx, 8
 COL47_SSE mm0, mm1, 0
@@ -597,6 +611,7 @@ xvid_Copy_Upsampled_8x8_16To8_xmm: ; 31
 STORE_1 mm2, mm3
+ pop ebp
 ret
 .endfunc
@@ -614,98 +629,103 @@ xvid_Add_Upsampled_8x8_16To8_xmm: ; 549
 mov edx, [esp+8] ; Src
 mov eax, [esp+12] ; BpS
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
 COL03_SSE mm0, mm1, 0
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 movq mm4, mm0
 movq mm5, mm1
 STORE_ADD_1 mm4, mm5
 add ecx, eax
 COL03_SSE mm2, mm3, 1
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL03_SSE mm0, mm1, 2
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL03_SSE mm2, mm3, 3
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL03_SSE mm0, mm1, 4
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL03_SSE mm2, mm3, 5
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL03_SSE mm0, mm1, 6
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL03_SSE mm2, mm3, 7
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 STORE_ADD_1 mm2, mm3
- mov ecx, [esp+4]
+ mov ecx, [esp+8]
 add ecx, 8
 COL47_SSE mm0, mm1, 0
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 movq mm4, mm0
 movq mm5, mm1
 STORE_ADD_1 mm4, mm5
 add ecx, eax
 COL47_SSE mm2, mm3, 1
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL47_SSE mm0, mm1, 2
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL47_SSE mm2, mm3, 3
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL47_SSE mm0, mm1, 4
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL47_SSE mm2, mm3, 5
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 COL47_SSE mm0, mm1, 6
- MUL_PACK mm0,mm1, [Up13], [Up31]
+ MUL_PACK mm0,mm1, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm2, mm3, mm0, mm1
 STORE_ADD_2 mm2, mm3
 COL47_SSE mm2, mm3, 7
- MUL_PACK mm2,mm3, [Up13], [Up31]
+ MUL_PACK mm2,mm3, [ebp + Up13 wrt ..gotoff], [ebp + Up31 wrt ..gotoff]
 MIX_ROWS mm0, mm1, mm2, mm3
 STORE_ADD_2 mm0, mm1
 STORE_ADD_1 mm2, mm3
+ pop ebp
 ret
 .endfunc
@@ -732,7 +752,10 @@ xvid_HFilter_31_mmx:
 mov edi, [esp+8 +8] ; Src2
 mov eax, [esp+12 +8] ; Nb_Blks
 lea eax,[eax*2]
- movq mm5, [Cst2]
+ push dword 0x00020002
+ push dword 0x00020002
+ movq mm5, [esp] ; Cst2
+ add esp, byte 8
 pxor mm7, mm7
 lea esi, [esi+eax*4]
@@ -848,7 +871,7 @@ xvid_HFilter_31_x86:
 ;//////////////////////////////////////////////////////////////////////
 %macro HFILTER_1331 2 ;%1:src %2:dst reg. -trashes mm0/mm1/mm2
- movq mm2, [Mask_ff]
+ movq mm2, [ebp + Mask_ff wrt ..gotoff]
 movq %2, [%1-1] ;-10123456
 movq mm0, [%1] ; 01234567
 movq mm1, [%1+1] ; 12345678
@@ -863,7 +886,7 @@ xvid_HFilter_31_x86:
 %endmacro
 %macro VFILTER_1331 4 ; %1-4: regs %1-%2: trashed
- paddsw %1, [Cst32]
+ paddsw %1, [ebp + Cst32 wrt ..gotoff]
 paddsw %2, %3
 pmullw %2, mm7
 paddsw %1,%4
@@ -899,7 +922,11 @@ xvid_Filter_18x18_To_8x8_mmx: ; 283c
 mov edx, [esp+8] ; Src
 mov eax, [esp+12] ; BpS
- movq mm7, [Cst3]
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm7, [ebp + Cst3 wrt ..gotoff]
 sub edx, eax
 ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line.
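; Aside (not part of the patch): hunks like the Cst2 one above show the
; patch's second PIC technique -- when no register is free to carry a GOT
; base, a small constant is synthesized on the stack at run time instead of
; being read from .rodata. Minimal sketch; make_cst2 is an illustrative
; label, and the value matches the {2,2,2,2} words that Cst2 held:

BITS 32
SECTION .text
make_cst2:
 push dword 0x00020002         ; two words of 2
 push dword 0x00020002         ; two more -> 8 bytes = {2,2,2,2}
 movq mm5, [esp]               ; load the constant, no .rodata reference
 add esp, byte 8               ; release the scratch space
 ret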
@@ -917,7 +944,7 @@ xvid_Filter_18x18_To_8x8_mmx: ; 283c
 ; process columns 4-7
- mov edx, [esp+8]
+ mov edx, [esp+12]
 sub edx, eax
 add edx, 8
@@ -930,6 +957,7 @@ xvid_Filter_18x18_To_8x8_mmx: ; 283c
 COPY_TWO_LINES_1331 ecx + 4*16 +8
 COPY_TWO_LINES_1331 ecx + 6*16 +8
+ pop ebp
 ret
 .endfunc
@@ -958,6 +986,11 @@ xvid_Filter_18x18_To_8x8_mmx: ; 283c
 movq [%1+16], mm2
 %endmacro
+extern _GLOBAL_OFFSET_TABLE_
+get_pc.bp:
+ mov ebp, [esp]
+ retn
+
 align 16
 xvid_Filter_Diff_18x18_To_8x8_mmx: ; 302c
@@ -965,7 +998,11 @@ xvid_Filter_Diff_18x18_To_8x8_mmx: ; 30
 mov edx, [esp+8] ; Src
 mov eax, [esp+12] ; BpS
- movq mm7, [Cst3]
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm7, [ebp + Cst3 wrt ..gotoff]
 sub edx, eax
 ; mm3/mm4/mm5/mm6 is used as a 4-samples delay line.
@@ -982,7 +1019,7 @@ xvid_Filter_Diff_18x18_To_8x8_mmx: ; 30
 DIFF_TWO_LINES_1331 ecx + 6*16
 ; process columns 4-7
- mov edx, [esp+8]
+ mov edx, [esp+12]
 sub edx, eax
 add edx, 8
@@ -995,6 +1032,7 @@ xvid_Filter_Diff_18x18_To_8x8_mmx: ; 30
 DIFF_TWO_LINES_1331 ecx + 4*16 +8
 DIFF_TWO_LINES_1331 ecx + 6*16 +8
+ pop ebp
 ret
 .endfunc
diff -urp xvidcore-1.1.0-old/src/motion/x86_asm/sad_3dn.asm xvidcore-1.1.0/src/motion/x86_asm/sad_3dn.asm
--- xvidcore-1.1.0-old/src/motion/x86_asm/sad_3dn.asm 2006-02-19 01:39:47.000000000 +0100
+++ xvidcore-1.1.0/src/motion/x86_asm/sad_3dn.asm 2006-02-19 01:49:09.000000000 +0100
@@ -44,20 +44,6 @@ BITS 32
 %endmacro
 ;=============================================================================
-; Read only data
-;=============================================================================
-
-%ifdef FORMAT_COFF
-SECTION .rodata
-%else
-SECTION .rodata align=16
-%endif
-
-ALIGN 16
-mmx_one:
- times 4 dw 1
-
-;=============================================================================
 ; Helper macros
 ;=============================================================================
 %macro SADBI_16x16_3DN 0
@@ -179,7 +165,10 @@ sad16bi_3dn:
 SADBI_16x16_3DN
 SADBI_16x16_3DN
- pmaddwd mm6, [mmx_one] ; collapse
+ push dword 0x00010001
+ push dword 0x00010001
+ pmaddwd mm6, [esp] ; collapse
+ add esp, byte 8
 movq mm7, mm6
 psrlq mm7, 32
 paddd mm6, mm7
 movd eax, mm6
@@ -216,7 +205,10 @@ sad8bi_3dn:
 SADBI_8x8_3DN
 SADBI_8x8_3DN
- pmaddwd mm6, [mmx_one] ; collapse
+ push dword 0x00010001
+ push dword 0x00010001
+ pmaddwd mm6, [esp] ; collapse
+ add esp, byte 8
 movq mm7, mm6
 psrlq mm7, 32
 paddd mm6, mm7
diff -urp xvidcore-1.1.0-old/src/motion/x86_asm/sad_mmx.asm xvidcore-1.1.0/src/motion/x86_asm/sad_mmx.asm
--- xvidcore-1.1.0-old/src/motion/x86_asm/sad_mmx.asm 2006-02-19 01:39:47.000000000 +0100
+++ xvidcore-1.1.0/src/motion/x86_asm/sad_mmx.asm 2006-02-19 01:49:09.000000000 +0100
@@ -45,20 +45,6 @@ BITS 32
 %endmacro
 ;=============================================================================
-; Read only data
-;=============================================================================
-
-%ifdef FORMAT_COFF
-SECTION .rodata
-%else
-SECTION .rodata align=16
-%endif
-
-ALIGN 16
-mmx_one:
- times 4 dw 1
-
-;=============================================================================
 ; Helper macros
 ;=============================================================================
@@ -181,8 +167,8 @@ mmx_one:
 paddusw mm0, mm2 ; mm01 = ref1 + ref2
 paddusw mm1, mm3
- paddusw mm0, [mmx_one] ; mm01 += 1
- paddusw mm1, [mmx_one]
+ paddusw mm0, [esp] ; mm01 += 1
+ paddusw mm1, [esp]
 psrlw mm0, 1 ; mm01 >>= 1
 psrlw mm1, 1
@@ -314,7 +300,7 @@ sad16_mmx:
 SAD_16x16_MMX
 SAD_16x16_MMX
- pmaddwd mm6, [mmx_one] ; collapse
+ pmaddwd mm6, [esp] ; collapse
 movq mm7, mm6
 psrlq mm7, 32
 paddd mm6, mm7
@@ -339,6 +325,9 @@ sad8_mmx:
 mov edx, [esp+ 8] ; Src2
 mov ecx, [esp+12] ; Stride
+ push dword 0x00010001
+ push dword 0x00010001
+
 pxor mm6, mm6 ; accum
 pxor mm7, mm7 ; zero
@@ -347,13 +336,13 @@ sad8_mmx:
 SAD_8x8_MMX
 SAD_8x8_MMX
- pmaddwd mm6, [mmx_one] ; collapse
+ pmaddwd mm6, [esp] ; collapse
 movq mm7, mm6
 psrlq mm7, 32
 paddd mm6, mm7
 movd eax, mm6
-
+ add esp, byte 8
 ret
 .endfunc
@@ -377,6 +366,9 @@ sad16v_mmx:
 mov ecx, [esp + 8 + 12] ; Stride
 mov ebx, [esp + 8 + 16] ; sad ptr
+ push dword 0x00010001
+ push dword 0x00010001
+
 pxor mm5, mm5 ; accum
 pxor mm6, mm6 ; accum
 pxor mm7, mm7 ; zero
@@ -390,8 +382,8 @@ sad16v_mmx:
 SADV_16x16_MMX
 SADV_16x16_MMX
- pmaddwd mm5, [mmx_one] ; collapse
- pmaddwd mm6, [mmx_one] ; collapse
+ pmaddwd mm5, [esp] ; collapse
+ pmaddwd mm6, [esp] ; collapse
 movq mm2, mm5
 movq mm3, mm6
@@ -421,8 +413,8 @@
 SADV_16x16_MMX
 SADV_16x16_MMX
- pmaddwd mm5, [mmx_one] ; collapse
- pmaddwd mm6, [mmx_one] ; collapse
+ pmaddwd mm5, [esp] ; collapse
+ pmaddwd mm6, [esp] ; collapse
 movq mm2, mm5
 movq mm3, mm6
@@ -442,6 +434,7 @@ sad16v_mmx:
 add eax, edi
+ add esp, byte 8
 pop edi
 pop ebx
@@ -465,6 +458,9 @@ sad16bi_mmx:
 mov ebx, [esp+4+12] ; Ref2
 mov ecx, [esp+4+16] ; Stride
+ push dword 0x00010001
+ push dword 0x00010001
+
 pxor mm6, mm6 ; accum2
 pxor mm7, mm7
 .Loop
@@ -502,12 +498,13 @@ sad16bi_mmx:
 SADBI_16x16_MMX 0, 0
 SADBI_16x16_MMX 8, 1
- pmaddwd mm6, [mmx_one] ; collapse
+ pmaddwd mm6, [esp] ; collapse
 movq mm7, mm6
 psrlq mm7, 32
 paddd mm6, mm7
 movd eax, mm6
+ add esp, byte 8
 pop ebx
 ret
@@ -530,6 +527,9 @@ sad8bi_mmx:
 mov ebx, [esp+4+12] ; Ref2
 mov ecx, [esp+4+16] ; Stride
+ push dword 0x00010001
+ push dword 0x00010001
+
 pxor mm6, mm6 ; accum2
 pxor mm7, mm7
 .Loop
@@ -542,12 +542,13 @@ sad8bi_mmx:
 SADBI_16x16_MMX 0, 1
 SADBI_16x16_MMX 0, 1
- pmaddwd mm6, [mmx_one] ; collapse
+ pmaddwd mm6, [esp] ; collapse
 movq mm7, mm6
 psrlq mm7, 32
 paddd mm6, mm7
 movd eax, mm6
+ add esp, byte 8
 pop ebx
 ret
 .endfunc
@@ -568,6 +569,9 @@ dev16_mmx:
 pxor mm5, mm5 ; accum1
 pxor mm6, mm6 ; accum2
+ push dword 0x00010001
+ push dword 0x00010001
+
 MEAN_16x16_MMX
 MEAN_16x16_MMX
 MEAN_16x16_MMX
@@ -587,7 +591,7 @@ dev16_mmx:
 MEAN_16x16_MMX
 paddusw mm6, mm5
- pmaddwd mm6, [mmx_one] ; collapse
+ pmaddwd mm6, [esp] ; collapse
 movq mm5, mm6
 psrlq mm5, 32
 paddd mm6, mm5
@@ -622,13 +626,14 @@
 ABS_16x16_MMX
 ABS_16x16_MMX
- pmaddwd mm5, [mmx_one] ; collapse
+ pmaddwd mm5, [esp] ; collapse
 movq mm6, mm5
 psrlq mm6, 32
 paddd mm6, mm5
 movd eax, mm6
+ add esp, byte 8
 ret
 .endfunc
diff -urp xvidcore-1.1.0-old/src/quant/x86_asm/quantize_h263_3dne.asm xvidcore-1.1.0/src/quant/x86_asm/quantize_h263_3dne.asm
--- xvidcore-1.1.0-old/src/quant/x86_asm/quantize_h263_3dne.asm 2006-02-19 01:39:47.000000000 +0100
+++ xvidcore-1.1.0/src/quant/x86_asm/quantize_h263_3dne.asm 2006-02-19 01:49:09.000000000 +0100
@@ -233,7 +233,8 @@ ALIGN 8
 movq mm3, [ebx] ;B2
 %endif
 %if (%1 == 3)
- imul eax, [int_div+4*edi]
+ mov esi, [esp + 4]
+ imul eax, [esi + int_div+4*edi wrt ..gotoff]
 %endif
 pxor mm5, mm4 ;C7
 pxor mm7, mm6 ;D7
@@ -313,7 +314,8 @@ ALIGN 8
 %endif
 nop
 %if (%1 == 3)
- imul eax, [int_div+4*edi]
+ mov esi, [esp +4]
+ imul eax, [esi + int_div+4*edi wrt ..gotoff]
 %endif
 pxor mm5, mm4 ;C7
 pxor mm7, mm6 ;D7
@@ -327,21 +329,25 @@ quant_h263_intra_3dne:
 mov eax, [esp + 12] ; quant
 mov ecx, [esp + 8] ; data
 mov edx, [esp + 4] ; coeff
+ push esi
+ push ebx
+ push edi
+ call get_pc.si
+ add esi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+ push esi
+
 cmp al, 1
 pxor mm1, mm1
 pxor mm3, mm3
 movq mm0, [ecx] ; mm0 = [1st]
 movq mm2, [ecx + 8]
- push esi
- lea esi, [mmx_div + eax*8 - 8]
+ mov ebx, [esi + mmzero wrt ..gotoff]
+ lea esi, [esi + mmx_div + eax*8 - 8 wrt ..gotoff]
- push ebx
- mov ebx, mmzero
- push edi
 jz near .q1loop
 quant_intra 0
- mov ebp, [esp + 16 + 16] ; dcscalar
+ mov ebp, [esp + 20 + 16] ; dcscalar
 ; NB -- there are 3 pushes in the function preambule and one more
 ; in "quant_intra 0", thus an added offset of 16 bytes
 movsx eax, word [byte ecx] ; DC
@@ -354,20 +360,20 @@ quant_h263_intra_3dne:
 quant_intra 2
 sub eax, edi ; DC (+1)
 xor ebp, edi ; sign(DC) dcscalar /2 (-1)
- mov edi, [esp + 16 + 16] ; dscalar
+ mov edi, [esp + 20 + 16] ; dscalar
 lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2
 mov ebp, [byte esp]
 quant_intra 3
 psubw mm5, mm4 ;C8
- mov esi, [esp + 12] ; pop back the register value
- mov edi, [esp + 4] ; pop back the register value
+ mov esi, [esp + 16] ; pop back the register value
+ mov edi, [esp + 8] ; pop back the register value
 sar eax, 16
 lea ebx, [byte eax + 1] ; workaround for eax < 0
 cmovs eax, ebx ; conditionnaly move the corrected value
 mov [edx], ax ; coeff[0] = ax
- mov ebx, [esp + 8] ; pop back the register value
- add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
+ mov ebx, [esp + 12] ; pop back the register value
+ add esp, byte 20 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
 psubw mm7, mm6 ;D8
 movq [edx + 3 * 32 + 16], mm5 ;C9
 movq [edx + 3 * 32 + 24], mm7 ;D9
@@ -379,7 +385,7 @@ ALIGN 16
 .q1loop
 quant_intra1 0
- mov ebp, [esp + 16 + 16] ; dcscalar
+ mov ebp, [esp + 20 + 16] ; dcscalar
 movsx eax, word [byte ecx] ; DC
 quant_intra1 1
@@ -390,20 +396,20 @@ ALIGN 16
 quant_intra1 2
 sub eax, edi ; DC (+1)
 xor ebp, edi ; sign(DC) dcscalar /2 (-1)
- mov edi, [esp + 16 + 16] ; dcscalar
+ mov edi, [esp + 20 + 16] ; dcscalar
 lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2
 mov ebp, [byte esp]
 quant_intra1 3
 psubw mm5, mm4 ;C8
- mov esi, [dword esp + 12] ; pop back the register value
- mov edi, [esp + 4] ; pop back the register value
+ mov esi, [dword esp + 16] ; pop back the register value
+ mov edi, [esp + 8] ; pop back the register value
 sar eax, 16
 lea ebx, [byte eax + 1] ; workaround for eax < 0
 cmovs eax, ebx ; conditionnaly move the corrected value
 mov [edx], ax ; coeff[0] = ax
- mov ebx, [esp + 8] ; pop back the register value
- add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
+ mov ebx, [esp + 12] ; pop back the register value
+ add esp, byte 20 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
 psubw mm7, mm6 ;D8
 movq [edx + 3 * 32 + 16], mm5 ;C9
 movq [edx + 3 * 32 + 24], mm7 ;D9
@@ -505,13 +511,18 @@ quant_h263_inter_3dne:
 mov eax, [esp + 12] ; quant
 push ebx
+ call get_pc.bx
+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+ push ebx
+
 pxor mm5, mm5 ; sum
 nop
- lea ebx,[mmx_sub + eax * 8 - 8] ; sub
- movq mm7, [mmx_div + eax * 8 - 8] ; divider
+ movq mm7, [ebx + mmx_div + eax * 8 - 8 wrt ..gotoff] ; divider
+ lea ebx,[ebx + mmx_sub + eax * 8 - 8 wrt ..gotoff] ; sub
 cmp al, 1
- lea eax, [mmzero]
+ mov eax, [esp]
+ lea eax, [eax + mmzero wrt ..gotoff]
 jz near .q1loop
 cmp esp, esp
 ALIGN 8
@@ -535,14 +546,15 @@ ALIGN 8
 pxor mm4, mm3 ;B9
 psubw mm4, mm3 ;B10
 movq [edx + 4*24+16], mm2 ;C11
- pop ebx
 movq [edx + 4*24+8], mm4 ;B11
- pmaddwd mm5, [plus_one]
+ pop ebx
+ pmaddwd mm5, [ebx + plus_one wrt ..gotoff]
 movq mm0, mm5
 punpckhdq mm5, mm5
 paddd mm0, mm5
 movd eax, mm0 ; return sum
+ pop ebx
 ret
 ALIGN 16
@@ -558,7 +570,8 @@ ALIGN 16
 quantinter1 6
 quantinter1 7
- pmaddwd mm5, [plus_one]
+ pop ebx
+ pmaddwd mm5, [ebx + plus_one wrt ..gotoff]
 movq mm0, mm5
 psrlq mm5, 32
 paddd mm0, mm5
@@ -658,23 +671,29 @@ dequant_h263_intra_3dne:
 pxor mm2, mm2
 push edi
 push ebx
- lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
 push ebp
- mov ebx, mmx_2047
- movsx ebp, word [ecx]
- lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
 push esi
- mov esi, mmzero
+
+ call get_pc.di
+ add edi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+ push edi
+
+ lea edi, [edi + mmx_mul + eax*8 - 8 wrt ..gotoff] ; 2*quant
+ mov esi, [esp]
+ lea ebx, [esi + mmx_2047 wrt ..gotoff]
+ movsx ebp, word [ecx]
+ lea eax, [esi + mmx_add + eax*8 - 8 wrt ..gotoff] ; quant or quant-1
+ lea esi, [esi + mmzero wrt ..gotoff]
 pxor mm7, mm7
 movq mm3, [ecx+120] ;B2 ; c = coeff[i]
 pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
- imul ebp, [esp+16+16] ; dcscalar
+ imul ebp, [esp+16+20] ; dcscalar
 psubw mm2, mm3 ;-c ;B3 (1st dep)
 pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
 pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
 psraw mm3, 15 ; sign(c) ;B7 (2nd)
- mov edx, [esp+ 4+16] ; data
+ mov edx, [esp+ 4+20] ; data
 ALIGN 8
 dequant 0
@@ -684,7 +703,8 @@ ALIGN 8
 dequant 1
- cmovl ebp, [int_2048]
+ mov ebp, [esp]
+ cmovl ebp, [ebp + int_2048 wrt ..gotoff]
 nop
 dequant 2
@@ -694,7 +714,8 @@ ALIGN 8
 dequant 3
- cmovg ebp, [int2047]
+ mov ebp, [esp]
+ cmovg ebp, [ebp + int2047 wrt ..gotoff]
 nop
 dequant 4
@@ -703,16 +724,16 @@ ALIGN 8
 pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
 pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
 mov eax, ebp
- mov esi, [esp]
- mov ebp, [esp+4]
+ mov esi, [esp+4]
+ mov ebp, [esp+8]
 pxor mm5, mm4 ;C13 (6th+)
 paddw mm7, mm3 ;B10 offset +negate back (3rd)
 movq [edx+4*24+16], mm5 ;C14 (7th)
 paddw mm2, mm7 ;B11 mm7 free (4th+)
 pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
- mov ebx, [esp+8]
- mov edi, [esp+12]
- add esp, byte 16
+ mov ebx, [esp+12]
+ mov edi, [esp+16]
+ add esp, byte 20
 pxor mm3, mm2 ;B13 (6th+)
 movq [edx+4*24+8], mm3 ;B14 (7th)
 mov [edx], ax
@@ -721,6 +742,20 @@ ALIGN 8
 ret
 .endfunc
+extern _GLOBAL_OFFSET_TABLE_
+get_pc
+.bx:
+ mov ebx, [esp]
+ retn
+
+.si:
+ mov esi, [esp]
+ retn
+
+.di:
+ mov edi, [esp]
+ retn
+
 ;-----------------------------------------------------------------------------
 ;
 ; uint32_t dequant_h263_inter_3dne(int16_t * data,
@@ -744,18 +779,24 @@ dequant_h263_inter_3dne:
 push edi
 push ebx
 push esi
- lea edi, [mmx_mul + eax*8 - 8] ; 2*quant
- mov ebx, mmx_2047
+
+ call get_pc.di
+ add edi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+ push edi
+
+ mov ebx, [edi + mmx_2047 wrt ..gotoff]
+ lea edi, [edi + mmx_mul + eax*8 - 8 wrt ..gotoff] ; 2*quant
 pxor mm7, mm7
 movq mm3, [ecx+120] ;B2 ; c = coeff[i]
 pcmpeqw mm7, [ecx+120] ;B6 (c ==0) ? -1 : 0 (1st)
- lea eax, [mmx_add + eax*8 - 8] ; quant or quant-1
+ mov esi, [esp]
+ lea eax, [esi + mmx_add + eax*8 - 8 wrt ..gotoff] ; quant or quant-1
 psubw mm2, mm3 ;-c ;B3 (1st dep)
- mov esi, mmzero
+ lea esi, [esi + mmzero wrt ..gotoff]
 pmaxsw mm2, mm3 ;|c| ;B4 (2nd)
 pmullw mm2, [edi] ;*= 2Q ;B8 (3rd+)
 psraw mm3, 15 ; sign(c) ;B7 (2nd)
- mov edx, [dword esp+ 4+12] ; data
+ mov edx, [dword esp+ 4+16] ; data
 ALIGN 8
@@ -768,15 +809,15 @@ ALIGN 8
 paddw mm4, mm6 ;C11 mm6 free (4th+)
 pminsw mm4, [ebx] ;C12 saturates to +2047 (5th+)
 pandn mm7, [eax] ;B9 offset = isZero ? 0 : quant_add (2nd)
- mov esi, [esp]
+ mov esi, [esp+4]
 pxor mm5, mm4 ;C13 (6th+)
 paddw mm7, mm3 ;B10 offset +negate back (3rd)
 movq [edx+4*24+16], mm5 ;C14 (7th)
 paddw mm2, mm7 ;B11 mm7 free (4th+)
 pminsw mm2, [ebx] ;B12 saturates to +2047 (5th+)
- mov ebx, [esp+4]
- mov edi, [esp+8]
- add esp, byte 12
+ mov ebx, [esp+8]
+ mov edi, [esp+12]
+ add esp, byte 16
 pxor mm3, mm2 ;B13 (6th+)
 movq [edx+4*24+8], mm3 ;B14 (7th)
diff -urp xvidcore-1.1.0-old/src/quant/x86_asm/quantize_h263_mmx.asm xvidcore-1.1.0/src/quant/x86_asm/quantize_h263_mmx.asm
--- xvidcore-1.1.0-old/src/quant/x86_asm/quantize_h263_mmx.asm 2006-02-19 01:39:47.000000000 +0100
+++ xvidcore-1.1.0/src/quant/x86_asm/quantize_h263_mmx.asm 2006-02-19 01:49:09.000000000 +0100
@@ -139,9 +139,10 @@ ALIGN 16
 quant_h263_intra_mmx:
 push esi
+ push edi
- mov esi, [esp + 4 + 8] ; data
- mov ecx,[esp + 4 + 16] ; dcscalar
+ mov esi, [esp + 8 + 8] ; data
+ mov ecx,[esp + 8 + 16] ; dcscalar
 movsx eax, word [esi] ; data[0]
 sar ecx,1 ; dcscalar /2
@@ -151,14 +152,17 @@ quant_h263_intra_mmx:
 sub eax,edx
 add eax,ecx ; + (dcscalar/2)*sgn(data[0])
- mov ecx, [esp + 4 + 12] ; quant
+ mov ecx, [esp + 8 + 12] ; quant
 cdq
- idiv dword [esp + 4 + 16] ; dcscalar
+ idiv dword [esp + 8 + 16] ; dcscalar
 cmp ecx, 1
- mov edx, [esp + 4 + 4] ; coeff
+ mov edx, [esp + 8 + 4] ; coeff
 je .low
-
- movq mm7, [mmx_div+ecx * 8 - 8]
+
+ call get_pc.di
+ add edi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm7, [edi + mmx_div+ecx * 8 - 8 wrt ..gotoff]
 mov ecx,4
 .loop
@@ -228,10 +232,11 @@ quant_h263_intra_mmx:
 jne .loop_low
 .end
- mov edx, [esp + 4 + 4] ; coeff
+ mov edx, [esp + 8 + 4] ; coeff
 mov [edx],ax
 xor eax,eax ; return 0
+ pop edi
 pop esi
 ret
 .endfunc
@@ -251,23 +256,28 @@ ALIGN 16
 quant_h263_intra_sse2:
 push esi
+ push edi
- mov esi, [esp + 4 + 8] ; data
+ mov esi, [esp + 8 + 8] ; data
 movsx eax, word [esi] ; data[0]
- mov ecx,[esp + 4 + 16] ; dcscalar
+ mov ecx,[esp + 8 + 16] ; dcscalar
 mov edx,eax
 sar ecx,1
 add eax,ecx
 sub edx,ecx
 cmovl eax,edx ; +/- dcscalar/2
- mov ecx, [esp + 4 + 12] ; quant
+ mov ecx, [esp + 8 + 12] ; quant
 cdq
- idiv dword [esp + 4 + 16] ; dcscalar
+ idiv dword [esp + 8 + 16] ; dcscalar
 cmp ecx, 1
- mov edx, [esp + 4 + 4] ; coeff
- movq xmm7, [mmx_div+ecx * 8 - 8]
+ mov edx, [esp + 8 + 4] ; coeff
+
+ call get_pc.di
+ add edi, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq xmm7, [edi + mmx_div+ecx * 8 - 8 wrt ..gotoff]
 je .low
 mov ecx,2
@@ -340,10 +350,11 @@ quant_h263_intra_sse2:
 jne .loop_low
 .end
- mov edx, [esp + 4 + 4] ; coeff
+ mov edx, [esp + 8 + 4] ; coeff
 mov [edx],ax
 xor eax,eax ; return 0
+ pop edi
 pop esi
 ret
 .endfunc
@@ -370,13 +381,16 @@ quant_h263_inter_mmx:
 xor ecx, ecx
+ call get_pc.dx
+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
 pxor mm5, mm5 ; sum
- movq mm6, [mmx_sub + eax * 8 - 8] ; sub
+ movq mm6, [edx + mmx_sub + eax * 8 - 8 wrt ..gotoff] ; sub
 cmp al, 1
 jz .q1loop
- movq mm7, [mmx_div + eax * 8 - 8] ; divider
+ movq mm7, [edx + mmx_div + eax * 8 - 8 wrt ..gotoff] ; divider
 ALIGN 8
 .loop
@@ -408,7 +422,7 @@ ALIGN 8
 jnz .loop
 .done
- pmaddwd mm5, [plus_one]
+ pmaddwd mm5, [edx + plus_one wrt ..gotoff]
 movq mm0, mm5
 psrlq mm5, 32
 paddd mm0, mm5
@@ -477,7 +491,10 @@ quant_h263_inter_sse2:
 pxor xmm5, xmm5 ; sum
- movq mm0, [mmx_sub + eax*8 - 8] ; sub
+ call get_pc.dx
+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm0, [edx + mmx_sub + eax*8 - 8 wrt ..gotoff] ; sub
 movq2dq xmm6, mm0 ; load into low 8 bytes
 movlhps xmm6, xmm6 ; duplicate into high 8 bytes
@@ -485,7 +502,7 @@ quant_h263_inter_sse2:
 jz near .qes2_q1loop
 .qes2_not1
- movq mm0, [mmx_div + eax*8 - 8] ; divider
+ movq mm0, [edx + mmx_div + eax*8 - 8 wrt ..gotoff] ; divider
 movq2dq xmm7, mm0
 movlhps xmm7, xmm7
@@ -519,7 +536,7 @@ ALIGN 16
 jnz .qes2_loop
 .qes2_done
- movdqu xmm6, [plus_one]
+ movdqu xmm6, [edx + plus_one wrt ..gotoff]
 pmaddwd xmm5, xmm6
 movhlps xmm6, xmm5
 paddd xmm5, xmm6
@@ -583,8 +600,12 @@ dequant_h263_intra_mmx:
 mov ecx, [esp+12] ; quant
 mov eax, [esp+ 8] ; coeff
+
+ call get_pc.dx
+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
 pcmpeqw mm0,mm0
- movq mm6, [mmx_quant + ecx*8] ; quant
+ movq mm6, [edx + mmx_quant + ecx*8 wrt ..gotoff] ; quant
 shl ecx,31 ; quant & 1 ? 0 : - 1
 movq mm7,mm6
 movq mm5,mm0
@@ -841,8 +862,12 @@ dequant_h263_inter_mmx:
 mov ecx, [esp+12] ; quant
 mov eax, [esp+ 8] ; coeff
+
+ call get_pc.dx
+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
 pcmpeqw mm0,mm0
- movq mm6, [mmx_quant + ecx*8] ; quant
+ movq mm6, [edx + mmx_quant + ecx*8 wrt ..gotoff] ; quant
 shl ecx,31 ; odd/even
 movq mm7,mm6
 movd mm1,ecx
@@ -912,8 +937,12 @@ dequant_h263_inter_xmm:
 mov ecx, [esp+12] ; quant
 mov eax, [esp+ 8] ; coeff
+
+ call get_pc.dx
+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
 pcmpeqw mm0,mm0
- movq mm6, [mmx_quant + ecx*8] ; quant
+ movq mm6, [edx + mmx_quant + ecx*8 wrt ..gotoff] ; quant
 shl ecx,31
 movq mm5,mm0
 movd mm1,ecx
@@ -967,7 +996,16 @@ dequant_h263_inter_xmm:
 ret
 .endfunc
-
+extern _GLOBAL_OFFSET_TABLE_
+get_pc
+.di:
+ mov edi, [esp]
+ retn
+
+.dx:
+ mov edx, [esp]
+ retn
+
 ;-----------------------------------------------------------------------------
 ;
 ; uint32_t dequant_h263_inter_sse2(int16_t * data,
@@ -983,7 +1021,10 @@ dequant_h263_inter_sse2:
 mov ecx, [esp+12] ; quant
 mov eax, [esp+ 8] ; coeff
- movq xmm6, [mmx_quant + ecx*8] ; quant
+ call get_pc.dx
+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq xmm6, [edx + mmx_quant + ecx*8 wrt ..gotoff] ; quant
 inc ecx
 pcmpeqw xmm5,xmm5
 and ecx,1
diff -urp xvidcore-1.1.0-old/src/quant/x86_asm/quantize_mpeg_mmx.asm xvidcore-1.1.0/src/quant/x86_asm/quantize_mpeg_mmx.asm
--- xvidcore-1.1.0-old/src/quant/x86_asm/quantize_mpeg_mmx.asm 2006-02-19 01:39:47.000000000 +0100
+++ xvidcore-1.1.0/src/quant/x86_asm/quantize_mpeg_mmx.asm 2006-02-19 01:51:27.000000000 +0100
@@ -162,7 +162,11 @@ quant_mpeg_intra_mmx:
 mov eax, [esp + 16 + 12] ; quant
 mov ebx, [esp + 16 + 20] ; mpeg_quant_matrices
- movq mm5, [quantd + eax * 8 - 8] ; quantd -> mm5
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm5, [ebp + quantd + eax * 8 - 8 wrt ..gotoff] ; quantd -> mm5
 xor ecx, ecx
 cmp al, 1
@@ -171,7 +175,7 @@ quant_mpeg_intra_mmx:
 cmp al, 2
 jz near .q2loop
- movq mm7, [mmx_div + eax * 8 - 8] ; multipliers[quant] -> mm7
+ movq mm7, [ebp + mmx_div + eax * 8 - 8 wrt ..gotoff] ; multipliers[quant] -> mm7
 ALIGN 16
 .loop
@@ -234,6 +238,7 @@ ALIGN 16
 mov [edi], ax ; coeff[0] = ax
+ pop ebp
 pop ebx
 pop edi
 pop esi
@@ -346,6 +351,10 @@ quant_mpeg_inter_mmx:
 mov eax, [esp + 16 + 12] ; quant
 mov ebx, [esp + 16 + 16] ; mpeg_quant_matrices
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
 xor ecx, ecx
 pxor mm5, mm5 ; sum
@@ -356,7 +365,7 @@ quant_mpeg_inter_mmx:
 cmp al, 2
 jz near .q2loop
- movq mm7, [mmx_div + eax * 8 - 8] ; divider
+ movq mm7, [ebp + mmx_div + eax * 8 - 8 wrt ..gotoff] ; divider
 ALIGN 16
 .loop
@@ -400,17 +409,17 @@ ALIGN 16
 jnz near .loop
 .done
- pmaddwd mm5, [mmx_one]
+ pmaddwd mm5, [ebp + mmx_one wrt ..gotoff]
 movq mm0, mm5
 psrlq mm5, 32
 paddd mm0, mm5
 movd eax, mm0 ; return sum
+ pop ebp
 pop ebx
 pop edi
 pop esi
 pop ecx
-
 ret
 ALIGN 16
@@ -556,7 +565,11 @@ dequant_mpeg_intra_mmx:
 mov eax, [esp + 4 + 12] ; quant
 mov ebx, [esp + 4 + 20] ; mpeg_quant_matrices
- movq mm7, [mmx_mul_quant + eax*8 - 8]
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm7, [ebp + mmx_mul_quant + eax*8 - 8 wrt ..gotoff]
 mov eax, -16 ; to keep ALIGNed, we regularly process coeff[0]
 psllw mm7, 2 ; << 2. See comment.
 pxor mm6, mm6 ; this is a NOP
@@ -595,11 +608,11 @@ ALIGN 16
 pmullw mm6, mm3 ; low of coeff*(matrix*quant)
 pmulhw mm3, mm5 ; high of coeff*(matrix*quant)
- pcmpgtw mm0, [zero]
+ pcmpgtw mm0, [ebp + zero wrt ..gotoff]
 paddusw mm2, mm0
 psrlw mm2, 5
- pcmpgtw mm3, [zero]
+ pcmpgtw mm3, [ebp + zero wrt ..gotoff]
 paddusw mm6, mm3
 psrlw mm6, 5
@@ -620,22 +633,28 @@ ALIGN 16
 ; deal with DC
 movd mm0, [ecx]
 pmullw mm0, [esp + 4 + 16] ; dcscalar
- movq mm2, [mmx_32767_minus_2047]
+ movq mm2, [ebp + mmx_32767_minus_2047 wrt ..gotoff]
 paddsw mm0, mm2
 psubsw mm0, mm2
- movq mm2, [mmx_32768_minus_2048]
+ movq mm2, [ebp + mmx_32768_minus_2048 wrt ..gotoff]
 psubsw mm0, mm2
 paddsw mm0, mm2
 movd eax, mm0
 mov [edx], ax
 xor eax, eax
-
+
+ pop ebp
 pop ebx
 ret
 .endfunc
+extern _GLOBAL_OFFSET_TABLE_
+get_pc.bp:
+ mov ebp, [esp]
+ retn
+
 ;-----------------------------------------------------------------------------
 ;
 ; uint32_t dequant_mpeg_inter_mmx(int16_t * data,
@@ -660,7 +679,11 @@ dequant_mpeg_inter_mmx:
 mov eax, [esp + 4 + 12] ; quant
 mov ebx, [esp + 4 + 16] ; mpeg_quant_matrices
- movq mm7, [mmx_mul_quant + eax*8 - 8]
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm7, [ebp + mmx_mul_quant + eax*8 - 8 wrt ..gotoff]
 mov eax, -16
 paddw mm7, mm7 ; << 1
 pxor mm6, mm6 ; mismatch sum
@@ -702,7 +725,7 @@ ALIGN 16
 movq mm4, mm7 ; (matrix*quant)
 pmullw mm4, [ebx + 512 + 8*eax + 8*16 -2*8 + 8]
- pcmpgtw mm5, [zero]
+ pcmpgtw mm5, [ebp + zero wrt ..gotoff]
 paddusw mm0, mm5
 psrlw mm0, 5
 pxor mm0, mm1 ; start restoring sign
@@ -713,7 +736,7 @@ ALIGN 16
 pmullw mm2, mm4 ; low of c*(matrix*quant)
 psubw mm0, mm1 ; finish restoring sign
- pcmpgtw mm5, [zero]
+ pcmpgtw mm5, [ebp + zero wrt ..gotoff]
 paddusw mm2, mm5
 psrlw mm2, 5
 pxor mm2, mm3 ; start restoring sign
@@ -744,7 +767,8 @@ ALIGN 16
 xor word [edx + 2*63], ax
 xor eax, eax
-
+
+ pop ebp
 pop ebx
 ret
diff -urp xvidcore-1.1.0-old/src/quant/x86_asm/quantize_mpeg_xmm.asm xvidcore-1.1.0/src/quant/x86_asm/quantize_mpeg_xmm.asm
--- xvidcore-1.1.0-old/src/quant/x86_asm/quantize_mpeg_xmm.asm 2006-02-19 01:39:47.000000000 +0100
+++ xvidcore-1.1.0/src/quant/x86_asm/quantize_mpeg_xmm.asm 2006-02-19 01:49:09.000000000 +0100
@@ -188,8 +188,12 @@ quant_mpeg_intra_xmm:
 push esi
 push edi
 push ebx
- nop
- mov edi, [esp + 12 + 20] ; mpeg_quant_matrices
+
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ mov edi, [esp + 16 + 20] ; mpeg_quant_matrices
 mov esi, -14
 pxor mm0, mm0
 pxor mm3, mm3
@@ -226,8 +230,8 @@ ALIGN 16
 psubw mm0, mm2
 psubw mm3, mm6
 nop4
- movq mm2, [quantd + ecx * 8 - 8]
- movq mm6, [mmx_divs + ecx * 8 - 8]
+ movq mm2, [ebp + quantd + ecx * 8 - 8 wrt ..gotoff]
+ movq mm6, [ebp + mmx_divs + ecx * 8 - 8 wrt ..gotoff]
 paddw mm5, mm2
 paddw mm7, mm2
 mov esp, esp
@@ -250,27 +254,28 @@ ALIGN 16
 .done
 ; calculate data[0] // (int32_t)dcscalar)
- mov esi, [esp + 12 + 16] ; dcscalar
+ mov esi, [esp + 16 + 16] ; dcscalar
 movsx ecx, word [eax]
 mov edi, ecx
- mov edx, [esp + 12 + 16]
+ mov edx, [esp + 16 + 16]
 shr edx, 1 ; ebx = dcscalar /2
 sar edi, 31 ; cdq is vectorpath
 xor edx, edi ; ebx = eax V -eax -1
 sub ecx, edi
 add ecx, edx
- mov edx, [dword esp + 12 + 4]
- mov esi, [int_div+4*esi]
+ mov edx, [dword esp + 16 + 4]
+ mov esi, [ebp + int_div+4*esi wrt ..gotoff]
 imul ecx, esi
 sar ecx, 17
 lea ebx, [byte ecx + 1]
 cmovs ecx, ebx
 ; idiv cx ; ecx = edi:ecx / dcscalar
- mov ebx, [esp]
- mov edi, [esp+4]
- mov esi, [esp+8]
- add esp, byte 12
+ mov ebp, [esp]
+ mov ebx, [esp+4]
+ mov edi, [esp+8]
+ mov esi, [esp+12]
+ add esp, byte 16
 mov [edx], cx ; coeff[0] = ax
 xor eax, eax
@@ -303,7 +308,7 @@ ALIGN 16
 psubw mm0, mm2 ;mismatch
 psubw mm3, mm6
 nop4
- movq mm2, [quantd + ecx * 8 - 8]
+ movq mm2, [ebp + quantd + ecx * 8 - 8 wrt ..gotoff]
 paddw mm5, mm2 ;first approx with quantd
 paddw mm7, mm2
 mov esp, esp
@@ -353,8 +358,8 @@ ALIGN 8
 psubw mm0, mm2 ;mismatch
 psubw mm3, mm6
 nop4
- movq mm2, [quantd + ecx * 8 - 8]
- movq mm6, [mmx_div + ecx * 8 - 8] ; divs for q<=16
+ movq mm2, [ebp + quantd + ecx * 8 - 8 wrt ..gotoff]
+ movq mm6, [ebp + mmx_div + ecx * 8 - 8 wrt ..gotoff] ; divs for q<=16
 paddw mm5, mm2 ;first approx with quantd
 paddw mm7, mm2
 mov esp, esp
@@ -397,8 +402,12 @@ quant_mpeg_inter_xmm:
 push esi
 push edi
 push ebx
- nop
- mov edi, [esp + 12 + 16]
+
+ push ebp
+ call get_pc.bp
+ add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ mov edi, [esp + 16 + 16]
 mov esi, -14
 mov ebx, esp
 sub esp, byte 24
@@ -440,8 +449,8 @@ ALIGN 16
 pmullw mm6, mm7
 psubw mm0, mm2
 psubw mm3, mm6
- movq mm2, [byte ebx]
- movq mm6, [mmx_divs + ecx * 8 - 8]
+ movq mm2, [ebp + ebx wrt ..gotoff]
+ movq mm6, [ebp + mmx_divs + ecx * 8 - 8 wrt ..gotoff]
 pmulhuw mm0, [edi + 768 + 8*esi+112]
 pmulhuw mm3, [edi + 768 + 8*esi+120]
 paddw mm2, [ebx+8] ;sum
@@ -466,11 +475,12 @@ ALIGN 16
 .done
 ; calculate data[0] // (int32_t)dcscalar)
 paddw mm2, [ebx]
- mov ebx, [esp+24]
- mov edi, [esp+4+24]
- mov esi, [esp+8+24]
- add esp, byte 12+24
- pmaddwd mm2, [mmx_one]
+ mov ebx, [esp+4+24]
+ mov edi, [esp+8+24]
+ mov esi, [esp+12+24]
+ pmaddwd mm2, [ebp + mmx_one wrt ..gotoff]
+ mov ebp, [esp+24]
+ add esp, byte 16+24
 punpckldq mm0, mm2 ;get low dw to mm0:high
 paddd mm0,mm2
 punpckhdq mm0, mm0 ;get result to low
@@ -554,7 +564,7 @@ ALIGN 8
 psubw mm0,mm2 ;mismatch
 psubw mm3,mm6
 movq mm2,[byte ebx]
- movq mm6,[mmx_div + ecx * 8 - 8] ; divs for q<=16
+ movq mm6,[ebp + mmx_div + ecx * 8 - 8 wrt ..gotoff] ; divs for q<=16
 pmulhuw mm0,[edi + 768 + 8*esi+112] ;correction
 pmulhuw mm3,[edi + 768 + 8*esi+120]
 paddw mm2,[ebx+8] ;sum
@@ -644,7 +654,11 @@ ALIGN 16
 dequant_mpeg_intra_3dne:
 mov eax, [esp+12] ; quant
 mov ecx, [esp+8] ; coeff
- movq mm7, [mmx_mul_quant + eax*8 - 8]
+
+ call get_pc.dx
+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm7, [edx + mmx_mul_quant + eax*8 - 8 wrt ..gotoff]
 psllw mm7, 2 ; << 2. See comment.
 mov edx, [esp+4] ; data
 push ebx
@@ -700,6 +714,16 @@ ALIGN 4
 ret
 .endfunc
+extern _GLOBAL_OFFSET_TABLE_
+get_pc
+.dx:
+ mov edx, [esp]
+ retn
+
+.bp:
+ mov ebp, [esp]
+ retn
+
 ;-----------------------------------------------------------------------------
 ;
 ; uint32_t dequant_mpeg_inter_3dne(int16_t * data,
@@ -716,16 +740,20 @@ ALIGN 4
 ALIGN 16
 dequant_mpeg_inter_3dne:
- mov edx, [esp+ 4] ; data
 mov ecx, [esp+ 8] ; coeff
 mov eax, [esp+12] ; quant
- movq mm7, [mmx_mul_quant + eax*8 - 8]
+
+ call get_pc.dx
+ add edx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ movq mm7, [edx + mmx_mul_quant + eax*8 - 8 wrt ..gotoff]
 mov eax, -14
 paddw mm7, mm7 ; << 1
 pxor mm6, mm6 ; mismatch sum
 push esi
 push edi
- mov esi, mmzero
+ mov esi, [edx + mmzero wrt ..gotoff]
+ mov edx, [esp + 8 + 4] ; data
 pxor mm1, mm1
 pxor mm3, mm3
 mov edi, [esp + 8 + 16] ; mpeg_quant_matrices
diff -urp xvidcore-1.1.0-old/src/utils/x86_asm/cpuid.asm xvidcore-1.1.0/src/utils/x86_asm/cpuid.asm
--- xvidcore-1.1.0-old/src/utils/x86_asm/cpuid.asm 2006-02-19 01:39:48.000000000 +0100
+++ xvidcore-1.1.0/src/utils/x86_asm/cpuid.asm 2006-02-19 03:10:34.000000000 +0100
@@ -66,20 +66,6 @@ BITS 32
 %define XVID_CPU_TSC (1<< 6)
 ;=============================================================================
-; Read only data
-;=============================================================================
-
-ALIGN 32
-%ifdef FORMAT_COFF
-SECTION .rodata
-%else
-SECTION .rodata align=16
-%endif
-
-vendorAMD:
- db "AuthenticAMD"
-
-;=============================================================================
 ; Macros
 ;=============================================================================
@@ -161,11 +147,11 @@ check_cpu_features:
 cpuid
 ; AMD cpu ?
- lea esi, [vendorAMD]
- lea edi, [esp]
- mov ecx, 12
- cld
- repe cmpsb
+ cmp dword [esp],"Auth"
+ jnz .cpu_quit
+ cmp dword [esp+4],"enti"
+ jnz .cpu_quit
+ cmp dword [esp+8],"cAMD"
 jnz .cpu_quit
 ; 3DNow! support ?
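; Aside (not part of the patch): the cpuid.asm hunk above works because
; CPUID leaf 0 returns the vendor string packed little-endian into
; ebx/edx/ecx ("Auth", "enti", "cAMD" on AMD parts), so three immediate
; dword compares can stand in for a repe cmpsb against a .rodata copy of
; "AuthenticAMD". Minimal stand-alone sketch; is_amd is an illustrative
; label:

BITS 32
SECTION .text
is_amd:                        ; eax = 1 on AuthenticAMD, else 0
 push ebx                      ; cpuid clobbers ebx (callee-saved)
 xor eax, eax
 cpuid                         ; ebx:edx:ecx = vendor id string
 xor eax, eax
 cmp ebx, "Auth"
 jnz .done
 cmp edx, "enti"
 jnz .done
 cmp ecx, "cAMD"
 jnz .done
 inc eax
.done:
 pop ebx
 ret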
@@ -208,7 +194,6 @@ sse2_os_trigger:
 ret
 .endfunc
-
 ; enter/exit mmx state
 ALIGN 16
 cglobal emms_mmx
diff -urp xvidcore-1.1.0-old/src/utils/x86_asm/interlacing_mmx.asm xvidcore-1.1.0/src/utils/x86_asm/interlacing_mmx.asm
--- xvidcore-1.1.0-old/src/utils/x86_asm/interlacing_mmx.asm 2006-02-19 01:39:48.000000000 +0100
+++ xvidcore-1.1.0/src/utils/x86_asm/interlacing_mmx.asm 2006-02-19 01:50:37.000000000 +0100
@@ -129,6 +129,11 @@ cglobal MBFieldTest_mmx
 paddw mm7, mm3
 %endmacro
+extern _GLOBAL_OFFSET_TABLE_
+get_pc.bx:
+ mov ebx, [esp]
+ retn
+
 ;-----------------------------------------------------------------------------
 ;
 ; uint32_t MBFieldTest_mmx(int16_t * const data);
 ;
 ;-----------------------------------------------------------------------------
@@ -141,7 +146,11 @@ MBFieldTest_mmx:
 push esi
 push edi
- mov esi, [esp+8+4] ; esi = top left block
+ push ebx
+ call get_pc.bx
+ add ebx, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
+
+ mov esi, [esp+12+4] ; esi = top left block
 mov edi, esi
 add edi, 256 ; edi = bottom left block
@@ -184,7 +193,7 @@ MBFieldTest_mmx:
 psubw m14, mm4
 paddw mm6, m14 ; add to frame total
- mov ecx, [nexts+eax*4] ; move esi/edi 8 pixels to the right
+ mov ecx, [ebx+nexts+eax*4 wrt ..gotoff] ; move esi/edi 8 pixels to the right
 add esi, ecx
 add edi, ecx
@@ -192,7 +201,7 @@ MBFieldTest_mmx:
 jnz near .loop
 .decide:
- movq mm0, [ones] ; add packed words into single dwords
+ movq mm0, [ebx + ones wrt ..gotoff] ; add packed words into single dwords
 pmaddwd mm6, mm0
 pmaddwd mm7, mm0
@@ -211,6 +220,7 @@ MBFieldTest_mmx:
 inc eax ; if frame>=field, use field dct (return 1)
 .end:
+ pop ebx
 pop edi
 pop esi
diff -urp xvidcore-1.1.0-old/src/utils/x86_asm/mem_transfer_mmx.asm xvidcore-1.1.0/src/utils/x86_asm/mem_transfer_mmx.asm
--- xvidcore-1.1.0-old/src/utils/x86_asm/mem_transfer_mmx.asm 2006-02-19 01:39:48.000000000 +0100
+++ xvidcore-1.1.0/src/utils/x86_asm/mem_transfer_mmx.asm 2006-02-19 01:49:09.000000000 +0100
@@ -46,20 +46,6 @@ BITS 32
 %endmacro
 ;=============================================================================
-; Read only data
-;=============================================================================
-
-%ifdef FORMAT_COFF
-SECTION .rodata
-%else
-SECTION .rodata align=16
-%endif
-
-ALIGN 16
-mmx_one:
- dw 1, 1, 1, 1
-
-;=============================================================================
 ; Code
 ;=============================================================================
@@ -260,8 +246,8 @@ transfer_8to16subro_mmx:
 punpckhbw mm3, mm7
 paddusw mm4, mm1
 paddusw mm6, mm3
- paddusw mm4, [mmx_one]
- paddusw mm6, [mmx_one]
+ paddusw mm4, [esp]
+ paddusw mm6, [esp]
 psrlw mm4, 1
 psrlw mm6, 1
 packuswb mm4, mm6
@@ -278,8 +264,8 @@ transfer_8to16subro_mmx:
 punpckhbw mm3, mm7
 paddusw mm5, mm1
 paddusw mm6, mm3
- paddusw mm5, [mmx_one]
- paddusw mm6, [mmx_one]
+ paddusw mm5, [esp]
+ paddusw mm6, [esp]
 lea esi, [esi+2*edx]
 psrlw mm5, 1
 psrlw mm6, 1
@@ -323,10 +309,13 @@ transfer_8to16sub2_mmx:
 mov edx, [esp+8+20] ; Stride
 pxor mm7, mm7
+ push dword 0x00010001
+ push dword 0x00010001
 COPY_8_TO_16_SUB2_MMX 0
 COPY_8_TO_16_SUB2_MMX 1
 COPY_8_TO_16_SUB2_MMX 2
 COPY_8_TO_16_SUB2_MMX 3
+ add esp, byte 8
 pop esi
 pop ebx