mirror of
				https://github.com/cookiengineer/audacity
				synced 2025-10-26 15:23:48 +01:00 
			
		
		
		
	Using LAME 3.10 Windows project files substantially changed from original, and included into audacity solution.
		
			
				
	
	
		
			423 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			423 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| ; back port from GOGO-no coda 2.24b by Takehiro TOMINAGA
 | |
| 
 | |
| ; GOGO-no-coda
 | |
| ;	Copyright (C) 1999 shigeo
 | |
| ;	special thanks to Keiichi SAKAI
 | |
|  
 | |
| %include "nasm.h"
 | |
| 
 | |
| 	globaldef fht_SSE
 | |
| 
 | |
| 	segment_data
 | |
| 	align 16
 | |
| Q_MMPP	dd	0x0,0x0,0x80000000,0x80000000
 | |
| Q_MPMP	dd	0x0,0x80000000,0x0,0x80000000
 | |
| D_1100	dd 0.0, 0.0, 1.0, 1.0
 | |
| costab_fft:
 | |
| 	dd 9.238795325112867e-01
 | |
| 	dd 3.826834323650898e-01
 | |
| 	dd 9.951847266721969e-01
 | |
| 	dd 9.801714032956060e-02
 | |
| 	dd 9.996988186962042e-01
 | |
| 	dd 2.454122852291229e-02
 | |
| 	dd 9.999811752836011e-01
 | |
| 	dd 6.135884649154475e-03
 | |
| S_SQRT2	dd	1.414213562
 | |
| 
 | |
| 	segment_code
 | |
| 
 | |
| PIC_OFFSETTABLE
 | |
| 
 | |
| ;------------------------------------------------------------------------
 | |
| ;	by K. SAKAI
 | |
| ;	99/08/18	PIII 23k[clk]
 | |
| ;	99/08/19	命令順序入れ換え PIII 22k[clk]
 | |
| ;	99/08/20	bit reversal を旧午後から移植した PIII 17k[clk]
 | |
| ;	99/08/23	一部 unroll PIII 14k[clk]
 | |
| ;	99/11/12	clean up
 | |
| ;
 | |
| ;void fht_SSE(float *fz, int n);
 | |
| 	align 16
 | |
| fht_SSE:
 | |
| 	push	ebx
 | |
| 	push	esi
 | |
| 	push	edi
 | |
| 	push	ebp
 | |
| 
 | |
| %assign _P 4*5
 | |
| 
 | |
| 	;2つ目のループ
 | |
| 	mov	eax,[esp+_P+0]	;eax=fz
 | |
| 	mov	ebp,[esp+_P+4]	;=n
 | |
| 	shl	ebp,3
 | |
| 	add	ebp,eax		; fn  = fz + n, この関数終了まで不変
 | |
| 	push	ebp
 | |
| 
 | |
| 	call	get_pc.bp
 | |
| 	add	ebp, PIC_BASE()
 | |
| 
 | |
| 	lea	ecx,[PIC_EBP_REL(costab_fft)]
 | |
| 	xor	eax,eax
 | |
| 	mov	al,8		; =k1=1*(sizeof float)	// 4, 16, 64, 256,...
 | |
| .lp2:				; do{
 | |
| 	mov	esi,[esp+_P+4]	; esi=fi=fz
 | |
| 	lea	edx,[eax+eax*2]
 | |
| 	mov	ebx, esi
 | |
| 
 | |
| ; たかだか2並列しか期待できない部分はFPUのほうが速い。
 | |
| 	loopalign	16
 | |
| .lp20:				; do{
 | |
| ;                       f0     = fi[0 ] + fi[k1];
 | |
| ;                       f2     = fi[k2] + fi[k3];
 | |
| ;                       f1     = fi[0 ] - fi[k1];
 | |
| ;                       f3     = fi[k2] - fi[k3];
 | |
| ;                       fi[0 ] = f0     + f2;
 | |
| ;                       fi[k1] = f1     + f3;
 | |
| ;                       fi[k2] = f0     - f2;
 | |
| ;                       fi[k3] = f1     - f3;
 | |
| 	lea	edi,[ebx+eax]	; edi=gi=fi+ki/2
 | |
| 	fld	dword [ebx]
 | |
| 	fadd	dword [ebx+eax*2]
 | |
| 	fld	dword [ebx+eax*4]
 | |
| 	fadd	dword [ebx+edx*2]
 | |
| 
 | |
| 	fld	dword [ebx]
 | |
| 	fsub	dword [ebx+eax*2]
 | |
| 	fld	dword [ebx+eax*4]
 | |
| 	fsub	dword [ebx+edx*2]
 | |
| 
 | |
| 	fld	st1
 | |
| 	fadd	st0,st1
 | |
| 	fstp	dword [ebx+eax*2]
 | |
| 	fsubp	st1,st0
 | |
| 	fstp	dword [ebx+edx*2]
 | |
| 
 | |
| 	fld	st1
 | |
| 	fadd	st0,st1
 | |
| 	fstp	dword [ebx]
 | |
| 	fsubp	st1,st0
 | |
| 	fstp	dword [ebx+eax*4]
 | |
| 
 | |
| 	lea	ebx,[ebx + eax*8]	; = fi += (k1 * 4);
 | |
| ;                       g0     = gi[0 ] + gi[k1];
 | |
| ;                       g2     = SQRT2  * gi[k2];
 | |
| ;                       g1     = gi[0 ] - gi[k1];
 | |
| ;                       g3     = SQRT2  * gi[k3];
 | |
| ;                       gi[0 ] = g0     + g2;
 | |
| ;                       gi[k2] = g0     - g2;
 | |
| ;                       gi[k1] = g1     + g3;
 | |
| ;                       gi[k3] = g1     - g3;
 | |
| 	fld	dword [edi]
 | |
| 	fadd	dword [edi+eax*2]
 | |
| 	fld	dword [PIC_EBP_REL(S_SQRT2)]
 | |
| 	fmul	dword [edi+eax*4]
 | |
| 
 | |
| 	fld	dword [edi]
 | |
| 	fsub	dword [edi+eax*2]
 | |
| 	fld	dword [PIC_EBP_REL(S_SQRT2)]
 | |
| 	fmul	dword [edi+edx*2]
 | |
| 
 | |
| 	fld	st1
 | |
| 	fadd	st0,st1
 | |
| 	fstp	dword [edi+eax*2]
 | |
| 	fsubp	st1,st0
 | |
| 	fstp	dword [edi+edx*2]
 | |
| 
 | |
| 	fld	st1
 | |
| 	fadd	st0,st1
 | |
| 	fstp	dword [edi]
 | |
| 	fsubp	st1,st0
 | |
| 	fstp	dword [edi+eax*4]
 | |
| 
 | |
| 	cmp	ebx,[esp]
 | |
| 	jl	near .lp20		; while (fi<fn);
 | |
| 
 | |
| 
 | |
| ;               i = 1; //for (i=1;i<kx;i++){
 | |
| ;                       c1 = 1.0*t_c - 0.0*t_s;
 | |
| ;                       s1 = 0.0*t_c + 1.0*t_s;
 | |
| 	movlps	xmm6,[ecx] ; = { --,  --,  s1, c1}
 | |
| 	movaps	xmm7,xmm6
 | |
| 
 | |
| 	shufps	xmm6,xmm6,R4(0,1,1,0)	; = {+c1, +s1, +s1, +c1} -> 必要
 | |
| ;                       c2 = c1*c1 - s1*s1 = 1 - (2*s1)*s1;
 | |
| ;                       s2 = c1*s1 + s1*c1 = 2*s1*c1;
 | |
| 	shufps	xmm7,xmm7,R4(1,0,0,1)
 | |
| 	movss	xmm5,xmm7		; = { --,  --,  --, s1}
 | |
| 	xorps	xmm7,[PIC_EBP_REL(Q_MMPP)]	; = {-s1, -c1, +c1, +s1} -> 必要
 | |
| 
 | |
| 	addss	xmm5,xmm5		; = (--, --,  --, 2*s1)
 | |
| 	add	esi,4		; esi = fi = fz + i
 | |
| 	shufps	xmm5,xmm5,R4(0,0,0,0)	; = (2*s1, 2*s1, 2*s1, 2*s1)
 | |
| 	mulps	xmm5,xmm6		; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1)
 | |
| 	subps	xmm5,[PIC_EBP_REL(D_1100)]		; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2}
 | |
| 	movaps	xmm4,xmm5
 | |
| 	shufps	xmm5,xmm5,R4(2,0,2,0)	; = {-c2, s2, -c2, s2} -> 必要
 | |
| 
 | |
| 	xorps	xmm4,[PIC_EBP_REL(Q_MMPP)]		; = {--, c2, --, s2}
 | |
| 	shufps	xmm4,xmm4,R4(0,2,0,2)	; = {s2, c2, s2, c2} -> 必要
 | |
| 
 | |
| 	loopalign	16
 | |
| .lp21:				; do{
 | |
| ;                               a       = c2*fi[k1] + s2*gi[k1];
 | |
| ;                               b       = s2*fi[k1] - c2*gi[k1];
 | |
| ;                               c       = c2*fi[k3] + s2*gi[k3];
 | |
| ;                               d       = s2*fi[k3] - c2*gi[k3];
 | |
| ;                               f0      = fi[0 ]        + a;
 | |
| ;                               g0      = gi[0 ]        + b;
 | |
| ;                               f2      = fi[k1 * 2]    + c;
 | |
| ;                               g2      = gi[k1 * 2]    + d;
 | |
| ;                               f1      = fi[0 ]        - a;
 | |
| ;                               g1      = gi[0 ]        - b;
 | |
| ;                               f3      = fi[k1 * 2]    - c;
 | |
| ;                               g3      = gi[k1 * 2]    - d;
 | |
| 	lea	edi,[esi + eax*2 - 8]	; edi = gi = fz +k1-i
 | |
| 
 | |
| 	movss	xmm0,[esi + eax*2]	; = fi[k1]
 | |
| 	movss	xmm2,[esi + edx*2]	; = fi[k3]
 | |
| 	shufps	xmm0,xmm2,0x00	; = {fi[k3], fi[k3], fi[k1], fi[k1]}
 | |
| 	movss	xmm1,[edi + eax*2]	; = fi[k1]
 | |
| 	movss	xmm3,[edi + edx*2]	; = fi[k3]
 | |
| 	shufps	xmm1,xmm3,0x00	; = {gi[k3], gi[k3], gi[k1], gi[k1]}
 | |
| 	movss	xmm2,[esi]		; = fi[0]
 | |
| 	mulps	xmm0,xmm4		; *= {+s2, +c2, +s2, +c2}
 | |
| 	movss	xmm3,[esi + eax*4]	; = fi[k2]
 | |
| 	unpcklps	xmm2,xmm3	; = {--, --, fi[k2], fi[0]}
 | |
| 	mulps	xmm1,xmm5		; *= {-c2, +s2, -c2, +s2}
 | |
| 	movss	xmm3,[edi + eax*4]	; = gi[k2]
 | |
| 	addps	xmm0,xmm1		; = {d, c, b, a}
 | |
| 	movss	xmm1,[edi]		; = gi[0]
 | |
| 	unpcklps	xmm1,xmm3	; = {--,  --, gi[k2], gi[0]}
 | |
| 	unpcklps	xmm2,xmm1	; = {gi[k2], fi[k2], gi[0], fi[0]}
 | |
| 	movaps	xmm1,xmm2
 | |
| 	addps	xmm1,xmm0	; = {g2, f2, g0, f0}
 | |
| 	subps	xmm2,xmm0	; = {g3, f3, g1, f1}
 | |
| 
 | |
| ;                               a       = c1*f2     + s1*g3;
 | |
| ;                               c       = s1*g2     + c1*f3;
 | |
| ;                               b       = s1*f2     - c1*g3;
 | |
| ;                               d       = c1*g2     - s1*f3;
 | |
| ;                               fi[0 ]  = f0        + a;
 | |
| ;                               gi[0 ]  = g0        + c;
 | |
| ;                               gi[k1]  = g1        + b;
 | |
| ;                               fi[k1]  = f1        + d;
 | |
| ;                               fi[k1 * 2]  = f0    - a;
 | |
| ;                               gi[k1 * 2]  = g0    - c;
 | |
| ;                               gi[k3]      = g1    - b;
 | |
| ;                               fi[k3]      = f1    - d;
 | |
| 	movaps	xmm3,xmm1
 | |
| 	movhlps	xmm1,xmm1	; = {g2, f2, g2, f2}
 | |
| 	shufps	xmm3,xmm2,0x14	; = {f1, g1, g0, f0}
 | |
| 	mulps	xmm1,xmm6	; *= {+c1, +s1, +s1, +c1}
 | |
| 	shufps	xmm2,xmm2,0xBB	; = {f3, g3, f3, g3}
 | |
| 	mulps	xmm2,xmm7	; *= {-s1, -c1, +c1, +s1}
 | |
| 	addps	xmm1,xmm2	; = {d, b, c, a}
 | |
| 	movaps	xmm2,xmm3
 | |
| 	addps	xmm3,xmm1	; = {fi[k1], gi[k1], gi[0], fi[0]}
 | |
| 	subps	xmm2,xmm1	; = {fi[k3], gi[k3], gi[k1*2], fi[k1*2]}
 | |
| 	movhlps	xmm0,xmm3
 | |
| 	movss	[esi],xmm3
 | |
| 	shufps	xmm3,xmm3,0x55
 | |
| 	movss	[edi+eax*2],xmm0
 | |
| 	shufps	xmm0,xmm0,0x55
 | |
| 	movss	[edi],xmm3
 | |
| 	movss	[esi+eax*2],xmm0
 | |
| 	movhlps	xmm0,xmm2
 | |
| 	movss	[esi+eax*4],xmm2
 | |
| 	shufps	xmm2,xmm2,0x55
 | |
| 	movss	[edi+edx*2],xmm0
 | |
| 	shufps	xmm0,xmm0,0x55
 | |
| 	movss	[edi+eax*4],xmm2
 | |
| 	movss	[esi+edx*2],xmm0
 | |
| 	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
 | |
| 	cmp	esi,[esp]
 | |
| 	jl	near .lp21		; while (fi<fn);
 | |
| 
 | |
| 
 | |
| ; unroll前のdo loopは43+4命令
 | |
| 
 | |
| ; 最内周ではないforループのi=2から先をunrollingした
 | |
| ; kx=   2,   8,  32,  128
 | |
| ; k4=  16,  64, 256, 1024
 | |
| ;       0, 6/2,30/2,126/2
 | |
| 
 | |
| 	xor	ebx,ebx
 | |
| 	mov	bl, 4*2		; = i = 4
 | |
| 	cmp	ebx,eax		; i < k1
 | |
| 	jnl	near .F22
 | |
| ;               for (i=2;i<kx;i+=2){
 | |
| 	loopalign	16
 | |
| .lp22:
 | |
| ; at here, xmm6 is {c3, s3, s3, c3}
 | |
| ;                       c1 = c3*t_c - s3*t_s;
 | |
| ;                       s1 = c3*t_s + s3*t_c;
 | |
| 	movlps	xmm0,[ecx]
 | |
| 	shufps	xmm0,xmm0,R4(1,1,0,0)	; = {t_s, t_s, t_c, t_c}
 | |
| 	mulps	xmm6,xmm0	; = {c3*ts, s3*ts, s3*tc, c3*tc}
 | |
| 	movhlps	xmm4,xmm6	; = {--,    --,    c3*ts, s3*ts}
 | |
| 	xorps	xmm4,[PIC_EBP_REL(Q_MPMP)]	; = {--,    --,   -c3*ts, s3*ts}
 | |
| 	subps	xmm6,xmm4	; = {-,-, c3*ts+s3*tc, c3*tc-s3*ts}={-,-,s1,c1}
 | |
| 
 | |
| ;                       c3 = c1*t_c - s1*t_s;
 | |
| ;                       s3 = s1*t_c + c1*t_s;
 | |
| 	shufps	xmm6,xmm6,0x14	; = {c1, s1, s1, c1}
 | |
| 	mulps	xmm0,xmm6	; = {ts*c1 ts*s1 tc*s1 tc*c1}
 | |
| 	movhlps	xmm3,xmm0
 | |
| 	xorps	xmm3,[PIC_EBP_REL(Q_MPMP)]
 | |
| 	subps	xmm0,xmm3	; = {--, --, s3, c3}
 | |
| 
 | |
| ; {s2 s4 c4 c2} = {2*s1*c1 2*s3*c3 1-2*s3*s3 1-2*s1*s1}
 | |
| 	unpcklps	xmm6,xmm0	; xmm6 = {s3, s1, c3, c1}
 | |
| 	movaps	xmm7, xmm6
 | |
| 	shufps	xmm6,xmm6,R4(2,3,1,0)	; xmm6 = {s1, s3, c3, c1}
 | |
| 	addps	xmm7, xmm7		; {s3*2, s1*2,   --,   --}
 | |
| 	mov	edi,[esp+_P+4]		; = fz
 | |
| 	shufps	xmm7, xmm7, R4(2,3,3,2)	; {s1*2, s3*2, s3*2, s1*2}
 | |
| 	sub	edi,ebx			; edi = fz - i/2
 | |
| 	mulps	xmm7, xmm6		; {s1*s1*2, s3*s3*2, s3*c3*2, s1*c1*2}
 | |
| 	lea	esi,[edi + ebx*2]	; esi = fi = fz +i/2
 | |
| 	subps	xmm7, [PIC_EBP_REL(D_1100)]		; {-c2, -c4, s4, s2}
 | |
| 	lea	edi,[edi + eax*2-4]	; edi = gi = fz +k1-i/2
 | |
| 
 | |
| ;                       fi = fz +i;
 | |
| ;                       gi = fz +k1-i;
 | |
| ;                       do{
 | |
| .lp220:
 | |
| ; unroll後のdo loopは51+4命令
 | |
| ;                               a       = c2*fi[k1  ] + s2*gi[k1  ];
 | |
| ;                               e       = c4*fi[k1+1] + s4*gi[k1-1];
 | |
| ;                               f       = s4*fi[k1+1] - c4*gi[k1-1];
 | |
| ;                               b       = s2*fi[k1  ] - c2*gi[k1  ];
 | |
| ;                               c       = c2*fi[k3  ] + s2*gi[k3  ];
 | |
| ;                               g       = c4*fi[k3+1] + s4*gi[k3-1];
 | |
| ;                               h       = s4*fi[k3+1] - c4*gi[k3-1];
 | |
| ;                               d       = s2*fi[k3  ] - c2*gi[k3  ];
 | |
| 
 | |
| 	movaps	xmm4,xmm7	; = {-c2 -c4  s4  s2}
 | |
| 	xorps	xmm4,[PIC_EBP_REL(Q_MMPP)]	; = { c2  c4  s4  s2}
 | |
| 	shufps	xmm4,xmm4,0x1B	; = { s2  s4  c4  c2}
 | |
| 	movlps	xmm0,[esi+eax*2]
 | |
| 	movlps	xmm1,[edi+eax*2]
 | |
| 	movlps	xmm2,[esi+edx*2]
 | |
| 	movlps	xmm3,[edi+edx*2]
 | |
| 	shufps	xmm0,xmm0,0x14
 | |
| 	shufps	xmm1,xmm1,0x41
 | |
| 	shufps	xmm2,xmm2,0x14
 | |
| 	shufps	xmm3,xmm3,0x41
 | |
| 	mulps	xmm0,xmm4
 | |
| 	mulps	xmm1,xmm7
 | |
| 	mulps	xmm2,xmm4
 | |
| 	mulps	xmm3,xmm7
 | |
| 	addps	xmm0,xmm1	; xmm0 = {b, f, e, a}
 | |
| 	addps	xmm2,xmm3	; xmm2 = {d, h, g, c}
 | |
| ;17
 | |
| 
 | |
| ;                               f0      = fi[0   ]    + a;
 | |
| ;                               f4      = fi[0 +1]    + e;
 | |
| ;                               g4      = gi[0 -1]    + f;
 | |
| ;                               g0      = gi[0   ]    + b;
 | |
| ;                               f1      = fi[0   ]    - a;
 | |
| ;                               f5      = fi[0 +1]    - e;
 | |
| ;                               g5      = gi[0 -1]    - f;
 | |
| ;                               g1      = gi[0   ]    - b;
 | |
| ;                               f2      = fi[k2  ]    + c;
 | |
| ;                               f6      = fi[k2+1]    + g;
 | |
| ;                               g6      = gi[k2-1]    + h;
 | |
| ;                               g2      = gi[k2  ]    + d;
 | |
| ;                               f3      = fi[k2  ]    - c;
 | |
| ;                               f7      = fi[k2+1]    - g;
 | |
| ;                               g7      = gi[k2-1]    - h;
 | |
| ;                               g3      = gi[k2  ]    - d;
 | |
| 	movlps	xmm1,[esi      ]
 | |
| 	movhps	xmm1,[edi      ]
 | |
| 	movaps	xmm4,xmm1
 | |
| 	subps	xmm1,xmm0	; xmm1 = {g1, g5, f5, f1}
 | |
| 	movlps	xmm3,[esi+eax*4]
 | |
| 	movhps	xmm3,[edi+eax*4]
 | |
| 	movaps	xmm5,xmm3
 | |
| 	subps	xmm3,xmm2	; xmm3 = {g3, g7, f7, f3}
 | |
| 	addps	xmm0,xmm4	; xmm0 = {g0, g4, f4, f0}
 | |
| 	addps	xmm2,xmm5	; xmm2 = {g2, g6, f6, f2}
 | |
| ;10
 | |
| 
 | |
| ;                               a       = c1*f2     + s1*g3;	順*順 + 逆*逆
 | |
| ;                               e       = c3*f6     + s3*g7;
 | |
| ;                               g       = s3*g6     + c3*f7;
 | |
| ;                               c       = s1*g2     + c1*f3;
 | |
| ;                               d       = c1*g2     - s1*f3;	順*逆 - 逆*順
 | |
| ;                               h       = c3*g6     - s3*f7;
 | |
| ;                               f       = s3*f6     - c3*g7;
 | |
| ;                               b       = s1*f2     - c1*g3;
 | |
| 
 | |
| 	movaps	xmm5,xmm6	; xmm6 = {s1, s3, c3, c1}
 | |
| 	shufps	xmm5,xmm5,0x1B	; = {c1, c3, s3, s1}
 | |
| 	movaps	xmm4,xmm2
 | |
| 	mulps	xmm4,xmm6
 | |
| 	shufps	xmm2,xmm2,0x1B	; xmm2 = {f2, f6, g6, g2}
 | |
| 	mulps	xmm2,xmm6
 | |
| 	mulps	xmm5,xmm3
 | |
| 	mulps	xmm3,xmm6
 | |
| 	shufps	xmm3,xmm3,0x1B
 | |
| 	addps	xmm4,xmm3	; = {c, g, e, a}
 | |
| 	subps	xmm2,xmm5	; = {b, f, h, d}
 | |
| ;10
 | |
| 
 | |
| ;                               fi[0   ]  = f0        + a;
 | |
| ;                               fi[0 +1]  = f4        + e;
 | |
| ;                               gi[0 -1]  = g4        + g;
 | |
| ;                               gi[0   ]  = g0        + c;
 | |
| ;                               fi[k2  ]  = f0        - a;
 | |
| ;                               fi[k2+1]  = f4        - e;
 | |
| ;                               gi[k2-1]  = g4        - g;
 | |
| ;                               gi[k2  ]  = g0        - c;
 | |
| ;                               fi[k1  ]  = f1        + d;
 | |
| ;                               fi[k1+1]  = f5        + h;
 | |
| ;                               gi[k1-1]  = g5        + f;
 | |
| ;                               gi[k1  ]  = g1        + b;
 | |
| ;                               fi[k3  ]  = f1        - d;
 | |
| ;                               fi[k3+1]  = f5        - h;
 | |
| ;                               gi[k3-1]  = g5        - f;
 | |
| ;                               gi[k3  ]  = g1        - b;
 | |
| 	movaps	xmm3,xmm0
 | |
| 	subps	xmm0,xmm4
 | |
| 	movlps	[esi+eax*4],xmm0
 | |
| 	movhps	[edi+eax*4],xmm0
 | |
| 	addps	xmm4,xmm3
 | |
| 	movlps	[esi      ],xmm4
 | |
| 	movhps	[edi      ],xmm4
 | |
| 
 | |
| 	movaps	xmm5,xmm1
 | |
| 	subps	xmm1,xmm2
 | |
| 	movlps	[esi+edx*2],xmm1
 | |
| 	movhps	[edi+edx*2],xmm1
 | |
| 	addps	xmm2,xmm5
 | |
| 	movlps	[esi+eax*2],xmm2
 | |
| 	movhps	[edi+eax*2],xmm2
 | |
| ; 14
 | |
| ;                               gi     += k4;
 | |
| ;                               fi     += k4;
 | |
| 	lea	edi,[edi + eax*8] ; gi += (k1 * 4);
 | |
| 	lea	esi,[esi + eax*8] ; fi += (k1 * 4);
 | |
| 	cmp	esi,[esp]
 | |
| 	jl	near .lp220		; while (fi<fn);
 | |
| ;                       } while (fi<fn);
 | |
| 
 | |
| 	add	ebx,byte 2*4	; i+= 4
 | |
| 	cmp	ebx,eax		; i < k1
 | |
| 	shufps	xmm6,xmm6,R4(1,2,2,1)	; (--,s3,c3,--) => {c3, s3, s3, c3}
 | |
| 	jl	near .lp22
 | |
| ;               }
 | |
| .F22:
 | |
| 	shl	eax,2
 | |
| 	add	ecx, byte 8
 | |
| 	cmp	eax,[esp+_P+8]	; while ((k1 * 4)<n);
 | |
| 	jle	near .lp2
 | |
| 	pop	ebp
 | |
| 	pop	ebp
 | |
| 	pop	edi
 | |
| 	pop	esi
 | |
| 	pop	ebx
 | |
| 	ret
 | |
| 
 | |
| 	end
 |