|
|
S_SQRT2 dd 1.414213562 | S_SQRT2 dd 1.414213562 |
| |
segment_code | segment_code |
|
|
|
extern _GLOBAL_OFFSET_TABLE_ |
|
get_pc.bp: |
|
mov ebp, [esp] |
|
retn |
|
|
;------------------------------------------------------------------------ | ;------------------------------------------------------------------------ |
; by K. SAKAI | ; by K. SAKAI |
; 99/08/18 PIII 23k[clk] | ; 99/08/18 PIII 23k[clk] |
|
|
push esi | push esi |
push edi | push edi |
push ebp | push ebp |
%assign _P 4*4 |
|
|
%assign _P 4*5 |
| |
;2つ目のループ | ;2つ目のループ |
mov eax,[esp+_P+4] ;eax=fz | mov eax,[esp+_P+4] ;eax=fz |
mov ebp,[esp+_P+8] ;=n | mov ebp,[esp+_P+8] ;=n |
shl ebp,3 | shl ebp,3 |
add ebp,eax ; fn = fz + n, この関数終了まで不変 | add ebp,eax ; fn = fz + n, この関数終了まで不変 |
|
push ebp |
|
|
|
call get_pc.bp |
|
add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc |
| |
lea ecx,[costab_fft] |
lea ecx,[ebp + costab_fft wrt ..gotoff] |
xor eax,eax | xor eax,eax |
mov al,8 ; =k1=1*(sizeof float) // 4, 16, 64, 256,... | mov al,8 ; =k1=1*(sizeof float) // 4, 16, 64, 256,... |
.lp2: ; do{ | .lp2: ; do{ |
|
|
; gi[k3] = g1 - g3; | ; gi[k3] = g1 - g3; |
fld dword [edi] | fld dword [edi] |
fadd dword [edi+eax*2] | fadd dword [edi+eax*2] |
fld dword [S_SQRT2] |
fld dword [ebp + S_SQRT2 wrt ..gotoff] |
fmul dword [edi+eax*4] | fmul dword [edi+eax*4] |
| |
fld dword [edi] | fld dword [edi] |
fsub dword [edi+eax*2] | fsub dword [edi+eax*2] |
fld dword [S_SQRT2] |
fld dword [ebp + S_SQRT2 wrt ..gotoff] |
fmul dword [edi+edx*2] | fmul dword [edi+edx*2] |
| |
fld st1 | fld st1 |
|
|
fsubp st1,st0 | fsubp st1,st0 |
fstp dword [edi+eax*4] | fstp dword [edi+eax*4] |
| |
cmp ebx,ebp |
cmp ebx,[esp] |
jl near .lp20 ; while (fi<fn); | jl near .lp20 ; while (fi<fn); |
| |
| |
|
|
; s2 = c1*s1 + s1*c1 = 2*s1*c1; | ; s2 = c1*s1 + s1*c1 = 2*s1*c1; |
shufps xmm7,xmm7,R4(1,0,0,1) | shufps xmm7,xmm7,R4(1,0,0,1) |
movss xmm5,xmm7 ; = { --, --, --, s1} | movss xmm5,xmm7 ; = { --, --, --, s1} |
xorps xmm7,[Q_MMPP] ; = {-s1, -c1, +c1, +s1} -> 必要 |
xorps xmm7,[ebp + Q_MMPP wrt ..gotoff] ; = {-s1, -c1, +c1, +s1} -> 必要 |
| |
addss xmm5,xmm5 ; = (--, --, --, 2*s1) | addss xmm5,xmm5 ; = (--, --, --, 2*s1) |
add esi,4 ; esi = fi = fz + i | add esi,4 ; esi = fi = fz + i |
shufps xmm5,xmm5,R4(0,0,0,0) ; = (2*s1, 2*s1, 2*s1, 2*s1) | shufps xmm5,xmm5,R4(0,0,0,0) ; = (2*s1, 2*s1, 2*s1, 2*s1) |
mulps xmm5,xmm6 ; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1) | mulps xmm5,xmm6 ; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1) |
subps xmm5,[D_1100] ; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2} |
subps xmm5,[ebp + D_1100 wrt ..gotoff] ; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2} |
movaps xmm4,xmm5 | movaps xmm4,xmm5 |
shufps xmm5,xmm5,R4(2,0,2,0) ; = {-c2, s2, -c2, s2} -> 必要 | shufps xmm5,xmm5,R4(2,0,2,0) ; = {-c2, s2, -c2, s2} -> 必要 |
| |
xorps xmm4,[Q_MMPP] ; = {--, c2, --, s2} |
xorps xmm4,[ebp + Q_MMPP wrt ..gotoff] ; = {--, c2, --, s2} |
shufps xmm4,xmm4,R4(0,2,0,2) ; = {s2, c2, s2, c2} -> 必要 | shufps xmm4,xmm4,R4(0,2,0,2) ; = {s2, c2, s2, c2} -> 必要 |
| |
loopalign 16 | loopalign 16 |
|
|
movss [edi+eax*4],xmm2 | movss [edi+eax*4],xmm2 |
movss [esi+edx*2],xmm0 | movss [esi+edx*2],xmm0 |
lea esi,[esi + eax*8] ; fi += (k1 * 4); | lea esi,[esi + eax*8] ; fi += (k1 * 4); |
cmp esi,ebp |
cmp esi,[esp] |
jl near .lp21 ; while (fi<fn); | jl near .lp21 ; while (fi<fn); |
| |
| |
|
|
shufps xmm0,xmm0,R4(1,1,0,0) ; = {t_s, t_s, t_c, t_c} | shufps xmm0,xmm0,R4(1,1,0,0) ; = {t_s, t_s, t_c, t_c} |
mulps xmm6,xmm0 ; = {c3*ts, s3*ts, s3*tc, c3*tc} | mulps xmm6,xmm0 ; = {c3*ts, s3*ts, s3*tc, c3*tc} |
movhlps xmm4,xmm6 ; = {--, --, c3*ts, s3*ts} | movhlps xmm4,xmm6 ; = {--, --, c3*ts, s3*ts} |
xorps xmm4,[Q_MPMP] ; = {--, --, -c3*ts, s3*ts} |
xorps xmm4,[ebp + Q_MPMP wrt ..gotoff] ; = {--, --, -c3*ts, s3*ts} |
subps xmm6,xmm4 ; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1} | subps xmm6,xmm4 ; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1} |
| |
; c3 = c1*t_c - s1*t_s; | ; c3 = c1*t_c - s1*t_s; |
|
|
shufps xmm6,xmm6,0x14 ; = {c1, s1, s1, c1} | shufps xmm6,xmm6,0x14 ; = {c1, s1, s1, c1} |
mulps xmm0,xmm6 ; = {ts*c1 ts*s1 tc*s1 tc*c1} | mulps xmm0,xmm6 ; = {ts*c1 ts*s1 tc*s1 tc*c1} |
movhlps xmm3,xmm0 | movhlps xmm3,xmm0 |
xorps xmm3,[Q_MPMP] |
xorps xmm3,[ebp + Q_MPMP wrt ..gotoff] |
subps xmm0,xmm3 ; = {--, --, s3, c3} | subps xmm0,xmm3 ; = {--, --, s3, c3} |
| |
; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1} | ; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1} |
|
|
sub edi,ebx ; edi = fz - i/2 | sub edi,ebx ; edi = fz - i/2 |
mulps xmm7, xmm6 ; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2} | mulps xmm7, xmm6 ; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2} |
lea esi,[edi + ebx*2] ; esi = fi = fz +i/2 | lea esi,[edi + ebx*2] ; esi = fi = fz +i/2 |
subps xmm7, [D_1100] ; {-c2, -c4, s4, s2} |
subps xmm7, [ebp + D_1100 wrt ..gotoff] ; {-c2, -c4, s4, s2} |
lea edi,[edi + eax*2-4] ; edi = gi = fz +k1-i/2 | lea edi,[edi + eax*2-4] ; edi = gi = fz +k1-i/2 |
| |
; fi = fz +i; | ; fi = fz +i; |
|
|
; d = s2*fi[k3 ] - c2*gi[k3 ]; | ; d = s2*fi[k3 ] - c2*gi[k3 ]; |
| |
movaps xmm4,xmm7 ; = {-c2 -c4 s4 s2} | movaps xmm4,xmm7 ; = {-c2 -c4 s4 s2} |
xorps xmm4,[Q_MMPP] ; = { c2 c4 s4 s2} |
xorps xmm4,[ebp + Q_MMPP wrt ..gotoff] ; = { c2 c4 s4 s2} |
shufps xmm4,xmm4,0x1B ; = { s2 s4 c4 c2} | shufps xmm4,xmm4,0x1B ; = { s2 s4 c4 c2} |
movlps xmm0,[esi+eax*2] | movlps xmm0,[esi+eax*2] |
movlps xmm1,[edi+eax*2] | movlps xmm1,[edi+eax*2] |
|
|
; fi += k4; | ; fi += k4; |
lea edi,[edi + eax*8] ; gi += (k1 * 4); | lea edi,[edi + eax*8] ; gi += (k1 * 4); |
lea esi,[esi + eax*8] ; fi += (k1 * 4); | lea esi,[esi + eax*8] ; fi += (k1 * 4); |
cmp esi,ebp |
cmp esi,[esp] |
jl near .lp220 ; while (fi<fn); | jl near .lp220 ; while (fi<fn); |
; } while (fi<fn); | ; } while (fi<fn); |
| |
|
|
cmp eax,[esp+_P+8] ; while ((k1 * 4)<n); | cmp eax,[esp+_P+8] ; while ((k1 * 4)<n); |
jle near .lp2 | jle near .lp2 |
pop ebp | pop ebp |
|
pop ebp |
pop edi | pop edi |
pop esi | pop esi |
pop ebx | pop ebx |