View | Details | Raw Unified
Collapse All | Expand All

(-) lame-398-orig/libmp3lame/i386/choose_table.nas (-48 / +65 lines)
 Lines 111-143   choose_table_H Link Here 
	dw	0x1d16, 0x1e16, 0x1e17, 0x1f17, 0x1f17
	dw	0x1d16, 0x1e16, 0x1e17, 0x1f17, 0x1f17
choose_jump_table_L:
choose_jump_table_L:
	dd	table_MMX.L_case_0
	dd	table_MMX.L_case_0    - choose_table_MMX
	dd	table_MMX.L_case_1
	dd	table_MMX.L_case_1    - choose_table_MMX
	dd	table_MMX.L_case_2
	dd	table_MMX.L_case_2    - choose_table_MMX
	dd	table_MMX.L_case_3
	dd	table_MMX.L_case_3    - choose_table_MMX
	dd	table_MMX.L_case_45
	dd	table_MMX.L_case_45   - choose_table_MMX
	dd	table_MMX.L_case_45
	dd	table_MMX.L_case_45   - choose_table_MMX
	dd	table_MMX.L_case_67
	dd	table_MMX.L_case_67   - choose_table_MMX
	dd	table_MMX.L_case_67
	dd	table_MMX.L_case_67   - choose_table_MMX
	dd	table_MMX.L_case_8_15
	dd	table_MMX.L_case_8_15 - choose_table_MMX
	dd	table_MMX.L_case_8_15
	dd	table_MMX.L_case_8_15 - choose_table_MMX
	dd	table_MMX.L_case_8_15
	dd	table_MMX.L_case_8_15 - choose_table_MMX
	dd	table_MMX.L_case_8_15
	dd	table_MMX.L_case_8_15 - choose_table_MMX
	dd	table_MMX.L_case_8_15
	dd	table_MMX.L_case_8_15 - choose_table_MMX
	dd	table_MMX.L_case_8_15
	dd	table_MMX.L_case_8_15 - choose_table_MMX
	dd	table_MMX.L_case_8_15
	dd	table_MMX.L_case_8_15 - choose_table_MMX
	dd	table_MMX.L_case_8_15
	dd	table_MMX.L_case_8_15 - choose_table_MMX
	segment_code
	segment_code
;
;
; use MMX
; use MMX
;
;
extern  _GLOBAL_OFFSET_TABLE_
; get_pc.bp -- position-independent-code helper.
; Reached via `call get_pc.bp`, so on entry [esp] holds the return address
; (the address of the instruction following that call). The helper copies
; that address into ebp and returns; the caller then adds the
; `_GLOBAL_OFFSET_TABLE_ ... wrt ..gotpc` displacement to ebp.
; Out:      ebp = return address of the call
; Clobbers: ebp (the caller must have saved it beforehand)
get_pc.bp:
	mov ebp, [esp]		; ebp = return address sitting on top of the stack
	retn
	align	16
	align	16
; int choose_table(int *ix, int *end, int *s)
; int choose_table(int *ix, int *end, int *s)
choose_table_MMX:
choose_table_MMX:
	mov	ecx,[esp+4]	;ecx = begin
	push	ebp
	mov	edx,[esp+8]	;edx = end
	call	get_pc.bp
	add	ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
	mov	ecx,[esp+8]	;ecx = begin
	mov	edx,[esp+12]	;edx = end
	sub	ecx,edx		;ecx = begin-end(should be minus)
	sub	ecx,edx		;ecx = begin-end(should be minus)
	test	ecx,8
	test	ecx,8
 	pxor	mm0,mm0		;mm0=[0:0]
 	pxor	mm0,mm0		;mm0=[0:0]
 Lines 169-181   choose_table_MMX: Link Here 
	cmp	eax,15
	cmp	eax,15
	ja	.with_ESC
	ja	.with_ESC
	jmp	[choose_jump_table_L+eax*4]
	lea	ecx,[ebp + choose_table_MMX wrt ..gotoff]
	add	ecx,[ebp + choose_jump_table_L+eax*4 wrt ..gotoff]
	jmp 	ecx
.with_ESC1:
.with_ESC1:
	emms
	emms
	mov	ecx, [esp+12]	; *s
	mov	ecx, [esp+16]	; *s
	mov	[ecx], eax
	mov	[ecx], eax
	or	eax,-1
	or	eax,-1
	pop	ebp
	ret
	ret
.with_ESC:
.with_ESC:
 Lines 187-198   choose_table_MMX: Link Here 
	push	esi
	push	esi
	bsr	eax, eax
	bsr	eax, eax
%assign _P 4*2
%assign _P 4*2
	movq    mm5, [D15_15_15_15]
	movq    mm5, [ebp + D15_15_15_15 wrt ..gotoff]
	movq	mm6, [D14_14_14_14]
	movq	mm6, [ebp + D14_14_14_14 wrt ..gotoff]
	movq	mm3, [mul_add]
	movq	mm3, [ebp + mul_add wrt ..gotoff]
	mov	ecx, [esp+_P+4]		; = ix
	mov	ecx, [esp+_P+8]		; = ix
;	mov	edx, [esp+_P+8]		; = end
;	mov	edx, [esp+_P+12]	; = end
	sub	ecx, edx
	sub	ecx, edx
	xor	esi, esi	; sum = 0
	xor	esi, esi	; sum = 0
 Lines 209-215   choose_table_MMX: Link Here 
	psubw	mm7, mm2	; when greater than 14: linbits_sum++;
	psubw	mm7, mm2	; when greater than 14: linbits_sum++;
	pmaddwd	mm0, mm3	; {0, 0, y, x}*{1, 16, 1, 16}
	pmaddwd	mm0, mm3	; {0, 0, y, x}*{1, 16, 1, 16}
	movd	ebx, mm0
	movd	ebx, mm0
	mov	esi, [largetbl+ebx*4+(16*16+16)*4]
	mov	esi, [ebp + largetbl+ebx*4+(16*16+16)*4 wrt ..gotoff]
	jz	.H_dual_exit
	jz	.H_dual_exit
 Lines 224-232   choose_table_MMX: Link Here 
	pmaddwd	mm0, mm3	; {y, x, y, x}*{1, 16, 1, 16}
	pmaddwd	mm0, mm3	; {y, x, y, x}*{1, 16, 1, 16}
	movd	ebx, mm0
	movd	ebx, mm0
	punpckhdq	mm0,mm0
	punpckhdq	mm0,mm0
	add	esi, [largetbl+ebx*4+(16*16+16)*4]
	add	esi, [ebp + largetbl+ebx*4+(16*16+16)*4 wrt ..gotoff]
	movd	ebx, mm0
	movd	ebx, mm0
	add	esi, [largetbl+ebx*4+(16*16+16)*4]
	add	esi, [ebp + largetbl+ebx*4+(16*16+16)*4 wrt ..gotoff]
	add	ecx, 16
	add	ecx, 16
	psubw	mm7, mm2	; when greater than 14: linbits_sum++;
	psubw	mm7, mm2	; when greater than 14: linbits_sum++;
	jnz	.H_dual_lp1
	jnz	.H_dual_lp1
 Lines 237-244   choose_table_MMX: Link Here 
	paddd	mm7,mm1
	paddd	mm7,mm1
	punpckldq	mm7,mm7
	punpckldq	mm7,mm7
	pmaddwd	mm7, [linbits32+eax*8]	; linbits
	pmaddwd	mm7, [ebp + linbits32+eax*8 wrt ..gotoff]	; linbits
	mov	ax, [choose_table_H+eax*2]
	mov	ax, [ebp + choose_table_H+eax*2 wrt ..gotoff]
	movd	ecx, mm7
	movd	ecx, mm7
	punpckhdq	mm7,mm7
	punpckhdq	mm7,mm7
 Lines 261-314   choose_table_MMX: Link Here 
	mov	edx, ecx
	mov	edx, ecx
	shr	eax, 8
	shr	eax, 8
.chooseE_s1:
.chooseE_s1:
	mov	ecx, [esp+12] ; *s
	mov	ecx, [esp+16] ; *s
	and	eax, 0xff
	and	eax, 0xff
	add	[ecx], edx
	add	[ecx], edx
	pop	ebp
	ret
	ret
table_MMX.L_case_0:
table_MMX.L_case_0:
	emms
	emms
	pop	ebp
	ret
	ret
table_MMX.L_case_1:
table_MMX.L_case_1:
	emms
	emms
	mov	eax, [esp+12] ; *s
	mov	eax, [esp+16] ; *s
	mov	ecx, [esp+4] ; *ix
	mov	ecx, [esp+8] ; *ix
	sub	ecx, edx
	sub	ecx, edx
	push	ebx
	push	ebx
.lp:
.lp:
	mov	ebx, [edx+ecx]
	mov	ebx, [edx+ecx]
	add	ebx, ebx
	add	ebx, ebx
	add	ebx, [edx+ecx+4]
	add	ebx, [edx+ecx+4]
	movzx	ebx, byte [ebx+t1l]
	movzx	ebx, byte [ebp + ebx+t1l wrt ..gotoff]
	add	[eax], ebx
	add	[eax], ebx
	add	ecx, 8
	add	ecx, 8
	jnz	.lp
	jnz	.lp
	pop	ebx
	pop	ebx
	mov	eax, 1
	mov	eax, 1
	pop	ebp
	ret
	ret
table_MMX.L_case_45:
table_MMX.L_case_45:
	push	dword 7
	push	dword 7
	mov	ecx, tableABC+9*8
	lea	ecx, [ebp + tableABC+9*8 wrt ..gotoff]
	jmp	from3
	jmp	from3
table_MMX.L_case_67:
table_MMX.L_case_67:
	push	dword 10
	push	dword 10
	mov	ecx, tableABC
	lea	ecx, [ebp + tableABC wrt ..gotoff]
	jmp	from3
	jmp	from3
table_MMX.L_case_8_15:
table_MMX.L_case_8_15:
	push	dword 13
	push	dword 13
	mov	ecx, tableDEF
	lea	ecx, [ebp + tableDEF wrt ..gotoff]
from3:
from3:
	mov	eax,[esp+8]	;eax = *begin
	mov	eax,[esp+12]	;eax = *begin
;	mov	edx,[esp+12]	;edx = *end
;	mov	edx,[esp+16]	;edx = *end
	push	ebx
	push	ebx
	sub	eax, edx
	sub	eax, edx
	movq	mm5,[mul_add]
	movq	mm5,[ebp + mul_add wrt ..gotoff]
	pxor	mm2,mm2	;mm2 = sum
	pxor	mm2,mm2	;mm2 = sum
	test	eax, 8
	test	eax, 8
 Lines 361-382   from3: Link Here 
.choose3_s2:
.choose3_s2:
	pop	ecx
	pop	ecx
	add	eax, ecx
	add	eax, ecx
	mov	ecx, [esp+12] ; *s
	mov	ecx, [esp+16] ; *s
	add	[ecx], edx
	add	[ecx], edx
	pop	ebp
	ret
	ret
table_MMX.L_case_2:
table_MMX.L_case_2:
	push	dword 2
	push	dword 2
	mov	ecx,table23
	lea	ecx,[ebp + table23 wrt ..gotoff]
	pmov	mm5,[mul_add23]
	pmov	mm5,[ebp + mul_add23 wrt ..gotoff]
	jmp	from2
	jmp	from2
table_MMX.L_case_3:
table_MMX.L_case_3:
	push	dword 5
	push	dword 5
	mov	ecx,table56
	lea	ecx,[ebp + table56 wrt ..gotoff]
	pmov	mm5,[mul_add56]
	pmov	mm5,[ebp + mul_add56 wrt ..gotoff]
from2:
from2:
	mov	eax,[esp+8]	;eax = *begin
	mov	eax,[esp+12]	;eax = *begin
;	mov	edx,[esp+12]	;edx = *end
;	mov	edx,[esp+16]	;edx = *end
	push	ebx
	push	ebx
	push	edi
	push	edi
 Lines 426-433   from2: Link Here 
	mov	edx, ecx
	mov	edx, ecx
	inc	eax
	inc	eax
.choose2_s1:
.choose2_s1:
	mov	ecx, [esp+12] ; *s
	mov	ecx, [esp+16] ; *s
	add	[ecx], edx
	add	[ecx], edx
	pop	ebp
	ret
	ret
	end
	end
(-) lame-398-orig/libmp3lame/i386/fft3dn.nas (-29 / +42 lines)
 Lines 24-49   D_1_0_0_0 dd 0.0 , 1.0 Link Here 
	segment_code
	segment_code
extern  _GLOBAL_OFFSET_TABLE_
; get_pc.bp -- position-independent-code helper.
; Reached via `call get_pc.bp`, so on entry [esp] holds the return address
; (the address of the instruction following that call). The helper copies
; that address into ebp and returns; the caller then adds the
; `_GLOBAL_OFFSET_TABLE_ ... wrt ..gotpc` displacement to ebp.
; Out:      ebp = return address of the call
; Clobbers: ebp (the caller must have saved it beforehand)
get_pc.bp:
	mov ebp, [esp]		; ebp = return address sitting on top of the stack
	retn
;void fht_3DN(float *fz, int nn);
;void fht_3DN(float *fz, int nn);
proc	fht_3DN
proc	fht_3DN
	pushd	ebp, ebx, esi, edi
	pushd	ebp, ebx, esi, edi
	mov	r0, [esp+20]		;fi
	sub	esp, 20
	mov	r1, [esp+24]		;r1 = nn
	sub	esp, 16
	call	get_pc.bp
	add	ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
	mov	r0, [esp+40]		;fi
	mov	r1, [esp+44]		;r1 = nn
	lea	r3, [ebp + costab wrt ..gotoff]		;tri = costab
	lea	r4, [r0+r1*8]		;r4 = fn = &fz[n]
	mov	[esp+16], r4
	mov	r4, 8			;kx = k1/2
	mov	r4, 8			;kx = k1/2
	mov	r3, costab		;tri = costab
	lea	r6, [r0+r1*8]		;r6 = fn = &fz[n]
	pmov	mm7, [r3]
	pmov	mm7, [r3]
	loopalign 16
	loopalign 16
.do1
.do1
	lea	r3, [r3+16]	;tri += 2;
	lea	r3, [r3+16]	;tri += 2;
	pmov	mm6, [costab+8]
	pmov	mm6, [ebp + costab+8 wrt ..gotoff]
	lea	r2, [r4+r4*2]		;k3*fsize/2
	lea	r2, [r4+r4*2]		;k3*fsize/2
	mov	r5, 4		;i = 1*fsize
	mov	r5, 4		;i = 1*fsize
 Lines 104-110   proc fht_3DN Link Here 
	pmovd	[r1+r4*4], mm4	;gi[k2]
	pmovd	[r1+r4*4], mm4	;gi[k2]
	puphdq	mm4, mm4
	puphdq	mm4, mm4
	cmp	r0, r6
	cmp	r0, [esp + 16]
	pmovd	[r1+r4*2], mm0	;gi[k1]
	pmovd	[r1+r4*2], mm0	;gi[k1]
	pmovd	[r1+r2*2], mm4	;gi[k3]
	pmovd	[r1+r2*2], mm4	;gi[k3]
 Lines 119-130   proc fht_3DN Link Here 
; mm7 = 0x80000000 | 0
; mm7 = 0x80000000 | 0
;
;
	pmov	mm1, mm6
	pmov	mm1, mm6
	mov	r0, [esp+36]	; fz
	mov	r0, [esp+40]	; fz
	puphdq	mm1, mm1	; c1 | c1
	puphdq	mm1, mm1	; c1 | c1
	lea	r1, [r0+r4*2]
	lea	r1, [r0+r4*2]
	pfadd	mm1, mm1	; c1+c1 | c1+c1
	pfadd	mm1, mm1	; c1+c1 | c1+c1
	pfmul	mm1, mm6	; 2*c1*c1 | 2*c1*s1
	pfmul	mm1, mm6	; 2*c1*c1 | 2*c1*s1
	pfsub	mm1, [D_1_0_0_0] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2
	pfsub	mm1, [ebp + D_1_0_0_0 wrt ..gotoff] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2
	pmov	mm0, mm1
	pmov	mm0, mm1
	pxor	mm7, mm6	; c1 | -s1
	pxor	mm7, mm6	; c1 | -s1
 Lines 134-140   proc fht_3DN Link Here 
	puphdq	mm0, mm2	; s2 | c2
	puphdq	mm0, mm2	; s2 | c2
	puphdq	mm6, mm3	;-s1 | c1
	puphdq	mm6, mm3	;-s1 | c1
	pxor	mm0, [costab]	; c2 | -s2
	pxor	mm0, [ebp + costab wrt ..gotoff]	; c2 | -s2
; mm0 =  s2| c2
; mm0 =  s2| c2
; mm1 = -c2| s2
; mm1 = -c2| s2
 Lines 233-239   proc fht_3DN Link Here 
	lea	r0, [r0+r4*8]
	lea	r0, [r0+r4*8]
	lea	r1, [r1+r4*8]
	lea	r1, [r1+r4*8]
	cmp	r0, r6
	cmp	r0, [esp + 16]
	pmov	mm0, [esp]
	pmov	mm0, [esp]
	pmov	mm1, [esp+8]
	pmov	mm1, [esp+8]
 Lines 249-265   proc fht_3DN Link Here 
	pfsub	mm6, mm7	; c1*a-s1*b | s1*a+c1*b
	pfsub	mm6, mm7	; c1*a-s1*b | s1*a+c1*b
	pupldq	mm7,mm6
	pupldq	mm7,mm6
	puphdq	mm6,mm7
	puphdq	mm6,mm7
	pmov	mm7, [costab]
	pmov	mm7, [ebp + costab wrt ..gotoff]
	jb near	.for
	jb near	.for
	mov	r0, [esp+36]	;fi
	mov	r0, [esp+40]	;fi
	cmp	r4, [esp+36+4]
	cmp	r4, [esp+40+4]
	lea	r4, [r4*4]	;kx *= 4
	lea	r4, [r4*4]	;kx *= 4
	jb near	.do1
	jb near	.do1
.exitttt
.exitttt
	femms
	femms
	add	esp,16
	add	esp,20
	popd	ebp, ebx, esi, edi
	popd	ebp, ebx, esi, edi
endproc
endproc
 Lines 270-289   proc fht_E3DN Link Here 
	pushd	ebp, ebx, esi, edi
	pushd	ebp, ebx, esi, edi
	mov	r0, [esp+20]		;fi
	sub	esp, 20
	mov	r1, [esp+24]		;r1 = nn
	sub	esp, 16
	call	get_pc.bp
	add	ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
	mov	r0, [esp+40]		;fi
	mov	r1, [esp+44]		;r1 = nn
	lea	r3, [ebp + costab wrt ..gotoff]		;tri = costab
	lea	r4, [r0+r1*8]		;r4 = fn = &fz[n]
	mov	[esp+16], r4
	mov	r4, 8			;kx = k1/2
	mov	r4, 8			;kx = k1/2
	mov	r3, costab		;tri = costab
	lea	r6, [r0+r1*8]		;r6 = fn = &fz[n]
	pmov	mm7, [r3]
	pmov	mm7, [r3]
	loopalign 16
	loopalign 16
.do1
.do1
	lea	r3, [r3+16]	;tri += 2;
	lea	r3, [r3+16]	;tri += 2;
	pmov	mm6, [costab+8]
	pmov	mm6, [ebp + costab+8 wrt ..gotoff]
	lea	r2, [r4+r4*2]		;k3*fsize/2
	lea	r2, [r4+r4*2]		;k3*fsize/2
	mov	r5, 4		;i = 1*fsize
	mov	r5, 4		;i = 1*fsize
 Lines 324-330   proc fht_E3DN Link Here 
	pfadd	mm3, mm4	;f0+f2|f1+f3
	pfadd	mm3, mm4	;f0+f2|f1+f3
	pfsub	mm5, mm4	;f0-f2|f1-f3
	pfsub	mm5, mm4	;f0-f2|f1-f3
	cmp	r0, r6
	cmp	r0, [esp + 16]
	pmovd	[r1+r4*2], mm3	;gi[k1]
	pmovd	[r1+r4*2], mm3	;gi[k1]
	pmovd	[r1+r2*2], mm5	;gi[k3]
	pmovd	[r1+r2*2], mm5	;gi[k3]
	puphdq	mm3, mm3
	puphdq	mm3, mm3
 Lines 343-354   proc fht_E3DN Link Here 
; mm7 = 0x80000000 | 0
; mm7 = 0x80000000 | 0
;
;
	pmov	mm5, mm6
	pmov	mm5, mm6
	mov	r0, [esp+36]	; fz
	mov	r0, [esp+40]	; fz
	puphdq	mm5, mm5	; c1 | c1
	puphdq	mm5, mm5	; c1 | c1
	lea	r1, [r0+r4*2]
	lea	r1, [r0+r4*2]
	pfadd	mm5, mm5	; c1+c1 | c1+c1
	pfadd	mm5, mm5	; c1+c1 | c1+c1
	pfmul	mm5, mm6	; 2*c1*c1 | 2*c1*s1
	pfmul	mm5, mm6	; 2*c1*c1 | 2*c1*s1
	pfsub	mm5, [D_1_0_0_0] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2
	pfsub	mm5, [ebp + D_1_0_0_0 wrt ..gotoff] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2
	pswapd	mm4, mm5	; s2 |-c2
	pswapd	mm4, mm5	; s2 |-c2
	pxor	mm4, mm7	; s2 | c2
	pxor	mm4, mm7	; s2 | c2
 Lines 447-453   proc fht_E3DN Link Here 
	lea	r0, [r0+r4*8]
	lea	r0, [r0+r4*8]
	lea	r1, [r1+r4*8]
	lea	r1, [r1+r4*8]
	cmp	r0, r6
	cmp	r0, [esp + 16]
	pmov	mm4, [esp]
	pmov	mm4, [esp]
	pmov	mm5, [esp+8]
	pmov	mm5, [esp+8]
 Lines 462-477   proc fht_E3DN Link Here 
	pfsub	mm6, mm7	; c1*a-s1*b | s1*a+c1*b
	pfsub	mm6, mm7	; c1*a-s1*b | s1*a+c1*b
	pswapd	mm6, mm6 ; ???	; s1*a+c1*b | c1*a-s1*b
	pswapd	mm6, mm6 ; ???	; s1*a+c1*b | c1*a-s1*b
	pmov	mm7, [costab]
	pmov	mm7, [ebp + costab wrt ..gotoff]
	jb near	.for
	jb near	.for
	mov	r0, [esp+36]	;fi
	mov	r0, [esp+40]	;fi
	cmp	r4, [esp+36+4]
	cmp	r4, [esp+40+4]
	lea	r4, [r4*4]	;kx *= 4
	lea	r4, [r4*4]	;kx *= 4
	jb near	.do1
	jb near	.do1
.exitttt
.exitttt
	femms
	femms
	add	esp,16
	add	esp,20
	popd	ebp, ebx, esi, edi
	popd	ebp, ebx, esi, edi
endproc
endproc
(-) lame-398-orig/libmp3lame/i386/fftsse.nas (-14 / +26 lines)
 Lines 25-30   costab_fft: Link Here 
S_SQRT2	dd	1.414213562
S_SQRT2	dd	1.414213562
	segment_code
	segment_code
extern  _GLOBAL_OFFSET_TABLE_
; get_pc.bp -- position-independent-code helper.
; Reached via `call get_pc.bp`, so on entry [esp] holds the return address
; (the address of the instruction following that call). The helper copies
; that address into ebp and returns; the caller then adds the
; `_GLOBAL_OFFSET_TABLE_ ... wrt ..gotpc` displacement to ebp.
; Out:      ebp = return address of the call
; Clobbers: ebp (the caller must have saved it beforehand)
get_pc.bp:
	mov ebp, [esp]		; ebp = return address sitting on top of the stack
	retn
;------------------------------------------------------------------------
;------------------------------------------------------------------------
;	by K. SAKAI
;	by K. SAKAI
;	99/08/18	PIII 23k[clk]
;	99/08/18	PIII 23k[clk]
 Lines 40-54   fht_SSE: Link Here 
	push	esi
	push	esi
	push	edi
	push	edi
	push	ebp
	push	ebp
%assign _P 4*4
%assign _P 4*5
	;2つ目のループ
	;2つ目のループ
	mov	eax,[esp+_P+4]	;eax=fz
	mov	eax,[esp+_P+4]	;eax=fz
	mov	ebp,[esp+_P+8]	;=n
	mov	ebp,[esp+_P+8]	;=n
	shl	ebp,3
	shl	ebp,3
	add	ebp,eax		; fn  = fz + n, この関数終了まで不変
	add	ebp,eax		; fn  = fz + n, この関数終了まで不変
	push	ebp
	call	get_pc.bp
	add	ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc
	lea	ecx,[costab_fft]
	lea	ecx,[ebp + costab_fft wrt ..gotoff]
	xor	eax,eax
	xor	eax,eax
	mov	al,8		; =k1=1*(sizeof float)	// 4, 16, 64, 256,...
	mov	al,8		; =k1=1*(sizeof float)	// 4, 16, 64, 256,...
.lp2:				; do{
.lp2:				; do{
 Lines 101-112   fht_SSE: Link Here 
;                       gi[k3] = g1     - g3;
;                       gi[k3] = g1     - g3;
	fld	dword [edi]
	fld	dword [edi]
	fadd	dword [edi+eax*2]
	fadd	dword [edi+eax*2]
	fld	dword [S_SQRT2]
	fld	dword [ebp + S_SQRT2 wrt ..gotoff]
	fmul	dword [edi+eax*4]
	fmul	dword [edi+eax*4]
	fld	dword [edi]
	fld	dword [edi]
	fsub	dword [edi+eax*2]
	fsub	dword [edi+eax*2]
	fld	dword [S_SQRT2]
	fld	dword [ebp + S_SQRT2 wrt ..gotoff]
	fmul	dword [edi+edx*2]
	fmul	dword [edi+edx*2]
	fld	st1
	fld	st1
 Lines 121-127   fht_SSE: Link Here 
	fsubp	st1,st0
	fsubp	st1,st0
	fstp	dword [edi+eax*4]
	fstp	dword [edi+eax*4]
	cmp	ebx,ebp
	cmp	ebx,[esp]
	jl	near .lp20		; while (fi<fn);
	jl	near .lp20		; while (fi<fn);
 Lines 136-152   fht_SSE: Link Here 
;                       s2 = c1*s1 + s1*c1 = 2*s1*c1;
;                       s2 = c1*s1 + s1*c1 = 2*s1*c1;
	shufps	xmm7,xmm7,R4(1,0,0,1)
	shufps	xmm7,xmm7,R4(1,0,0,1)
	movss	xmm5,xmm7		; = { --,  --,  --, s1}
	movss	xmm5,xmm7		; = { --,  --,  --, s1}
	xorps	xmm7,[Q_MMPP]	; = {-s1, -c1, +c1, +s1} -> 必要
	xorps	xmm7,[ebp + Q_MMPP wrt ..gotoff]	; = {-s1, -c1, +c1, +s1} -> 必要
	addss	xmm5,xmm5		; = (--, --,  --, 2*s1)
	addss	xmm5,xmm5		; = (--, --,  --, 2*s1)
	add	esi,4		; esi = fi = fz + i
	add	esi,4		; esi = fi = fz + i
	shufps	xmm5,xmm5,R4(0,0,0,0)	; = (2*s1, 2*s1, 2*s1, 2*s1)
	shufps	xmm5,xmm5,R4(0,0,0,0)	; = (2*s1, 2*s1, 2*s1, 2*s1)
	mulps	xmm5,xmm6		; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1)
	mulps	xmm5,xmm6		; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1)
	subps	xmm5,[D_1100]		; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2}
	subps	xmm5,[ebp + D_1100 wrt ..gotoff]		; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2}
	movaps	xmm4,xmm5
	movaps	xmm4,xmm5
	shufps	xmm5,xmm5,R4(2,0,2,0)	; = {-c2, s2, -c2, s2} -> 必要
	shufps	xmm5,xmm5,R4(2,0,2,0)	; = {-c2, s2, -c2, s2} -> 必要
	xorps	xmm4,[Q_MMPP]		; = {--, c2, --, s2}
	xorps	xmm4,[ebp + Q_MMPP wrt ..gotoff]		; = {--, c2, --, s2}
	shufps	xmm4,xmm4,R4(0,2,0,2)	; = {s2, c2, s2, c2} -> 必要
	shufps	xmm4,xmm4,R4(0,2,0,2)	; = {s2, c2, s2, c2} -> 必要
	loopalign	16
	loopalign	16
 Lines 222-228   fht_SSE: Link Here 
	movss	[edi+eax*4],xmm2
	movss	[edi+eax*4],xmm2
	movss	[esi+edx*2],xmm0
	movss	[esi+edx*2],xmm0
	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
	cmp	esi,ebp
	cmp	esi,[esp]
	jl	near .lp21		; while (fi<fn);
	jl	near .lp21		; while (fi<fn);
 Lines 247-253   fht_SSE: Link Here 
	shufps	xmm0,xmm0,R4(1,1,0,0)	; = {t_s, t_s, t_c, t_c}
	shufps	xmm0,xmm0,R4(1,1,0,0)	; = {t_s, t_s, t_c, t_c}
	mulps	xmm6,xmm0	; = {c3*ts, s3*ts, s3*tc, c3*tc}
	mulps	xmm6,xmm0	; = {c3*ts, s3*ts, s3*tc, c3*tc}
	movhlps	xmm4,xmm6	; = {--,    --,    c3*ts, s3*ts}
	movhlps	xmm4,xmm6	; = {--,    --,    c3*ts, s3*ts}
	xorps	xmm4,[Q_MPMP]	; = {--,    --,   -c3*ts, s3*ts}
	xorps	xmm4,[ebp + Q_MPMP wrt ..gotoff]	; = {--,    --,   -c3*ts, s3*ts}
	subps	xmm6,xmm4	; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1}
	subps	xmm6,xmm4	; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1}
;                       c3 = c1*t_c - s1*t_s;
;                       c3 = c1*t_c - s1*t_s;
 Lines 255-261   fht_SSE: Link Here 
	shufps	xmm6,xmm6,0x14	; = {c1, s1, s1, c1}
	shufps	xmm6,xmm6,0x14	; = {c1, s1, s1, c1}
	mulps	xmm0,xmm6	; = {ts*c1 ts*s1 tc*s1 tc*c1}
	mulps	xmm0,xmm6	; = {ts*c1 ts*s1 tc*s1 tc*c1}
	movhlps	xmm3,xmm0
	movhlps	xmm3,xmm0
	xorps	xmm3,[Q_MPMP]
	xorps	xmm3,[ebp + Q_MPMP wrt ..gotoff]
	subps	xmm0,xmm3	; = {--, --, s3, c3}
	subps	xmm0,xmm3	; = {--, --, s3, c3}
; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1}
; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1}
 Lines 268-274   fht_SSE: Link Here 
	sub	edi,ebx			; edi = fz - i/2
	sub	edi,ebx			; edi = fz - i/2
	mulps	xmm7, xmm6		; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2}
	mulps	xmm7, xmm6		; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2}
	lea	esi,[edi + ebx*2]	; esi = fi = fz +i/2
	lea	esi,[edi + ebx*2]	; esi = fi = fz +i/2
	subps	xmm7, [D_1100]		; {-c2, -c4, s4, s2}
	subps	xmm7, [ebp + D_1100 wrt ..gotoff]		; {-c2, -c4, s4, s2}
	lea	edi,[edi + eax*2-4]	; edi = gi = fz +k1-i/2
	lea	edi,[edi + eax*2-4]	; edi = gi = fz +k1-i/2
;                       fi = fz +i;
;                       fi = fz +i;
 Lines 286-292   fht_SSE: Link Here 
;                               d       = s2*fi[k3  ] - c2*gi[k3  ];
;                               d       = s2*fi[k3  ] - c2*gi[k3  ];
	movaps	xmm4,xmm7	; = {-c2 -c4  s4  s2}
	movaps	xmm4,xmm7	; = {-c2 -c4  s4  s2}
	xorps	xmm4,[Q_MMPP]	; = { c2  c4  s4  s2}
	xorps	xmm4,[ebp + Q_MMPP wrt ..gotoff]	; = { c2  c4  s4  s2}
	shufps	xmm4,xmm4,0x1B	; = { s2  s4  c4  c2}
	shufps	xmm4,xmm4,0x1B	; = { s2  s4  c4  c2}
	movlps	xmm0,[esi+eax*2]
	movlps	xmm0,[esi+eax*2]
	movlps	xmm1,[edi+eax*2]
	movlps	xmm1,[edi+eax*2]
 Lines 390-396   fht_SSE: Link Here 
;                               fi     += k4;
;                               fi     += k4;
	lea	edi,[edi + eax*8] ; gi += (k1 * 4);
	lea	edi,[edi + eax*8] ; gi += (k1 * 4);
	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
	cmp	esi,ebp
	cmp	esi,[esp]
	jl	near .lp220		; while (fi<fn);
	jl	near .lp220		; while (fi<fn);
;                       } while (fi<fn);
;                       } while (fi<fn);
 Lines 405-410   fht_SSE: Link Here 
	cmp	eax,[esp+_P+8]	; while ((k1 * 4)<n);
	cmp	eax,[esp+_P+8]	; while ((k1 * 4)<n);
	jle	near .lp2
	jle	near .lp2
	pop	ebp
	pop	ebp
	pop	ebp
	pop	edi
	pop	edi
	pop	esi
	pop	esi
	pop	ebx
	pop	ebx