1
0
mirror of https://github.com/cookiengineer/audacity synced 2025-05-01 08:09:41 +02:00
James Crook 5e0efd1a25 Start on built-in LAME
Using LAME 3.10
Windows project files substantially changed from original, and included into audacity solution.
2019-03-26 17:46:53 +00:00

1023 lines
25 KiB
Plaintext

;
; (C) Frank Klemm 1995,99,2000
; Dedicated to the LAME project
;
;
%include "nasm.h"
segment_code
; float_t scalar04_float32_i387 (
; const float32_t* const p,
; const float32_t* const q );
proc scalar04_float32_i387
%$p arg 4
%$q arg 4
;;; alloc
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
fld dword [eax]
fmul dword [edx]
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
fld dword [eax + 8]
fmul dword [edx + 8]
faddp st1,st0
fld dword [eax + 12]
fmul dword [edx + 12]
faddp st1,st0
endproc
proc scalar08_float32_i387
%$p arg 4
%$q arg 4
;;; alloc
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
fld dword [eax]
fmul dword [edx]
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
fld dword [eax + 8]
fmul dword [edx + 8]
faddp st1,st0
fld dword [eax + 12]
fmul dword [edx + 12]
faddp st1,st0
fld dword [eax + 16]
fmul dword [edx + 16]
faddp st1,st0
fld dword [eax + 20]
fmul dword [edx + 20]
faddp st1,st0
fld dword [eax + 24]
fmul dword [edx + 24]
faddp st1,st0
fld dword [eax + 28]
fmul dword [edx + 28]
faddp st1,st0
endproc
proc scalar12_float32_i387
%$p arg 4
%$q arg 4
;;; alloc
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
fld dword [eax]
fmul dword [edx]
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
fld dword [eax + 8]
fmul dword [edx + 8]
faddp st1,st0
fld dword [eax + 12]
fmul dword [edx + 12]
faddp st1,st0
fld dword [eax + 16]
fmul dword [edx + 16]
faddp st1,st0
fld dword [eax + 20]
fmul dword [edx + 20]
faddp st1,st0
fld dword [eax + 24]
fmul dword [edx + 24]
faddp st1,st0
fld dword [eax + 28]
fmul dword [edx + 28]
faddp st1,st0
fld dword [eax + 32]
fmul dword [edx + 32]
faddp st1,st0
fld dword [eax + 36]
fmul dword [edx + 36]
faddp st1,st0
fld dword [eax + 40]
fmul dword [edx + 40]
faddp st1,st0
fld dword [eax + 44]
fmul dword [edx + 44]
faddp st1,st0
endproc
proc scalar16_float32_i387
%$p arg 4
%$q arg 4
;;; alloc
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
fld dword [eax]
fmul dword [edx]
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
fld dword [eax + 8]
fmul dword [edx + 8]
faddp st1,st0
fld dword [eax + 12]
fmul dword [edx + 12]
faddp st1,st0
fld dword [eax + 16]
fmul dword [edx + 16]
faddp st1,st0
fld dword [eax + 20]
fmul dword [edx + 20]
faddp st1,st0
fld dword [eax + 24]
fmul dword [edx + 24]
faddp st1,st0
fld dword [eax + 28]
fmul dword [edx + 28]
faddp st1,st0
fld dword [eax + 32]
fmul dword [edx + 32]
faddp st1,st0
fld dword [eax + 36]
fmul dword [edx + 36]
faddp st1,st0
fld dword [eax + 40]
fmul dword [edx + 40]
faddp st1,st0
fld dword [eax + 44]
fmul dword [edx + 44]
faddp st1,st0
fld dword [eax + 48]
fmul dword [edx + 48]
faddp st1,st0
fld dword [eax + 52]
fmul dword [edx + 52]
faddp st1,st0
fld dword [eax + 56]
fmul dword [edx + 56]
faddp st1,st0
fld dword [eax + 60]
fmul dword [edx + 60]
faddp st1,st0
endproc
proc scalar20_float32_i387
%$p arg 4
%$q arg 4
;;; alloc
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
fld dword [eax]
fmul dword [edx]
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
fld dword [eax + 8]
fmul dword [edx + 8]
faddp st1,st0
fld dword [eax + 12]
fmul dword [edx + 12]
faddp st1,st0
fld dword [eax + 16]
fmul dword [edx + 16]
faddp st1,st0
fld dword [eax + 20]
fmul dword [edx + 20]
faddp st1,st0
fld dword [eax + 24]
fmul dword [edx + 24]
faddp st1,st0
fld dword [eax + 28]
fmul dword [edx + 28]
faddp st1,st0
fld dword [eax + 32]
fmul dword [edx + 32]
faddp st1,st0
fld dword [eax + 36]
fmul dword [edx + 36]
faddp st1,st0
fld dword [eax + 40]
fmul dword [edx + 40]
faddp st1,st0
fld dword [eax + 44]
fmul dword [edx + 44]
faddp st1,st0
fld dword [eax + 48]
fmul dword [edx + 48]
faddp st1,st0
fld dword [eax + 52]
fmul dword [edx + 52]
faddp st1,st0
fld dword [eax + 56]
fmul dword [edx + 56]
faddp st1,st0
fld dword [eax + 60]
fmul dword [edx + 60]
faddp st1,st0
fld dword [eax + 64]
fmul dword [edx + 64]
faddp st1,st0
fld dword [eax + 68]
fmul dword [edx + 68]
faddp st1,st0
fld dword [eax + 72]
fmul dword [edx + 72]
faddp st1,st0
fld dword [eax + 76]
fmul dword [edx + 76]
faddp st1,st0
endproc
proc scalar24_float32_i387
%$p arg 4
%$q arg 4
;;; alloc
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
fld dword [eax]
fmul dword [edx]
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
fld dword [eax + 8]
fmul dword [edx + 8]
faddp st1,st0
fld dword [eax + 12]
fmul dword [edx + 12]
faddp st1,st0
fld dword [eax + 16]
fmul dword [edx + 16]
faddp st1,st0
fld dword [eax + 20]
fmul dword [edx + 20]
faddp st1,st0
fld dword [eax + 24]
fmul dword [edx + 24]
faddp st1,st0
fld dword [eax + 28]
fmul dword [edx + 28]
faddp st1,st0
fld dword [eax + 32]
fmul dword [edx + 32]
faddp st1,st0
fld dword [eax + 36]
fmul dword [edx + 36]
faddp st1,st0
fld dword [eax + 40]
fmul dword [edx + 40]
faddp st1,st0
fld dword [eax + 44]
fmul dword [edx + 44]
faddp st1,st0
fld dword [eax + 48]
fmul dword [edx + 48]
faddp st1,st0
fld dword [eax + 52]
fmul dword [edx + 52]
faddp st1,st0
fld dword [eax + 56]
fmul dword [edx + 56]
faddp st1,st0
fld dword [eax + 60]
fmul dword [edx + 60]
faddp st1,st0
fld dword [eax + 64]
fmul dword [edx + 64]
faddp st1,st0
fld dword [eax + 68]
fmul dword [edx + 68]
faddp st1,st0
fld dword [eax + 72]
fmul dword [edx + 72]
faddp st1,st0
fld dword [eax + 76]
fmul dword [edx + 76]
faddp st1,st0
fld dword [eax + 80]
fmul dword [edx + 80]
faddp st1,st0
fld dword [eax + 84]
fmul dword [edx + 84]
faddp st1,st0
fld dword [eax + 88]
fmul dword [edx + 88]
faddp st1,st0
fld dword [eax + 92]
fmul dword [edx + 92]
faddp st1,st0
endproc
proc scalar32_float32_i387
%$p arg 4
%$q arg 4
;;; alloc
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
fld dword [eax]
fmul dword [edx]
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
fld dword [eax + 8]
fmul dword [edx + 8]
faddp st1,st0
fld dword [eax + 12]
fmul dword [edx + 12]
faddp st1,st0
fld dword [eax + 16]
fmul dword [edx + 16]
faddp st1,st0
fld dword [eax + 20]
fmul dword [edx + 20]
faddp st1,st0
fld dword [eax + 24]
fmul dword [edx + 24]
faddp st1,st0
fld dword [eax + 28]
fmul dword [edx + 28]
faddp st1,st0
fld dword [eax + 32]
fmul dword [edx + 32]
faddp st1,st0
fld dword [eax + 36]
fmul dword [edx + 36]
faddp st1,st0
fld dword [eax + 40]
fmul dword [edx + 40]
faddp st1,st0
fld dword [eax + 44]
fmul dword [edx + 44]
faddp st1,st0
fld dword [eax + 48]
fmul dword [edx + 48]
faddp st1,st0
fld dword [eax + 52]
fmul dword [edx + 52]
faddp st1,st0
fld dword [eax + 56]
fmul dword [edx + 56]
faddp st1,st0
fld dword [eax + 60]
fmul dword [edx + 60]
faddp st1,st0
fld dword [eax + 64]
fmul dword [edx + 64]
faddp st1,st0
fld dword [eax + 68]
fmul dword [edx + 68]
faddp st1,st0
fld dword [eax + 72]
fmul dword [edx + 72]
faddp st1,st0
fld dword [eax + 76]
fmul dword [edx + 76]
faddp st1,st0
fld dword [eax + 80]
fmul dword [edx + 80]
faddp st1,st0
fld dword [eax + 84]
fmul dword [edx + 84]
faddp st1,st0
fld dword [eax + 88]
fmul dword [edx + 88]
faddp st1,st0
fld dword [eax + 92]
fmul dword [edx + 92]
faddp st1,st0
fld dword [eax + 96]
fmul dword [edx + 96]
faddp st1,st0
fld dword [eax +100]
fmul dword [edx +100]
faddp st1,st0
fld dword [eax +104]
fmul dword [edx +104]
faddp st1,st0
fld dword [eax +108]
fmul dword [edx +108]
faddp st1,st0
fld dword [eax +112]
fmul dword [edx +112]
faddp st1,st0
fld dword [eax +116]
fmul dword [edx +116]
faddp st1,st0
fld dword [eax +120]
fmul dword [edx +120]
faddp st1,st0
fld dword [eax +124]
fmul dword [edx +124]
faddp st1,st0
endproc
; float_t scalar4n_float32_i387 (
; const float32_t* const p,
; const float32_t* const q,
; const size_t len );
proc scalar4n_float32_i387
%$p arg 4
%$q arg 4
%$len arg 4
;;; alloc
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
mov ecx,[sp(%$len)]
fld dword [eax]
fmul dword [edx]
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
fld dword [eax + 8]
fmul dword [edx + 8]
faddp st1,st0
fld dword [eax + 12]
fmul dword [edx + 12]
faddp st1,st0
dec ecx
jz .ret1
add eax,byte 16
add edx,byte 16
.lbl1
fld dword [eax]
fmul dword [edx]
faddp st1,st0
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
fld dword [eax + 8]
fmul dword [edx + 8]
faddp st1,st0
fld dword [eax + 12]
fmul dword [edx + 12]
faddp st1,st0
add eax,byte 16
add edx,byte 16
dec ecx
jnz .lbl1
.ret1
endproc
; float_t scalar1n_float32_i387 (
; const float32_t* const p,
; const float32_t* const q,
; const size_t len );
proc scalar1n_float32_i387
%$p arg 4
%$q arg 4
%$len arg 4
;;; alloc
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
mov ecx,[sp(%$len)]
fld0
shr ecx,1
jnc .lbl2
fld dword [eax]
fmul dword [edx]
faddp st1,st0
add eax,byte 4
add edx,byte 4
.lbl2
shr ecx,1
jnc .lbl3
fld dword [eax]
fmul dword [edx]
faddp st1,st0
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
add eax,byte 8
add edx,byte 8
and ecx,ecx
.lbl3
jz .ret2
.lbl4
fld dword [eax]
fmul dword [edx]
faddp st1,st0
fld dword [eax + 4]
fmul dword [edx + 4]
faddp st1,st0
fld dword [eax + 8]
fmul dword [edx + 8]
faddp st1,st0
fld dword [eax + 12]
fmul dword [edx + 12]
faddp st1,st0
add eax,byte 16
add edx,byte 16
dec ecx
jnz .lbl4
.ret2
endproc
proc scalar04_float32_3DNow
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
pmov mm0,qword [eax]
pmov mm1,qword [eax+8]
pfmul mm0,qword [edx]
pfmul mm1,qword [edx+8]
pfadd mm0,mm1
pmov qword [sp(%$p)],mm0
femms
fld dword [sp(%$p)]
fadd dword [sp(%$p)+4]
endproc
proc scalar08_float32_3DNow
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
pmov mm0,qword [eax]
pmov mm1,qword [eax+8]
pfmul mm0,qword [edx]
pfmul mm1,qword [edx+8]
pmov mm2,qword [eax+16]
pmov mm3,qword [eax+24]
pfmul mm2,qword [edx+16]
pfmul mm3,qword [edx+24]
pfadd mm0,mm2
pfadd mm1,mm3
pfadd mm0,mm1
pmov qword [sp(%$p)],mm0
femms
fld dword [sp(%$p)]
fadd dword [sp(%$p)+4]
endproc
proc scalar12_float32_3DNow
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
pmov mm0,qword [eax]
pmov mm1,qword [eax+8]
pfmul mm0,qword [edx]
pfmul mm1,qword [edx+8]
pmov mm2,qword [eax+16]
pmov mm3,qword [eax+24]
pfmul mm2,qword [edx+16]
pfmul mm3,qword [edx+24]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+32]
pmov mm3,qword [eax+40]
pfmul mm2,qword [edx+32]
pfmul mm3,qword [edx+40]
pfadd mm0,mm2
pfadd mm1,mm3
pfadd mm0,mm1
pmov qword [sp(%$p)],mm0
femms
fld dword [sp(%$p)]
fadd dword [sp(%$p)+4]
endproc
proc scalar16_float32_3DNow
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
pmov mm0,qword [eax]
pmov mm1,qword [eax+8]
pfmul mm0,qword [edx]
pfmul mm1,qword [edx+8]
pmov mm2,qword [eax+16]
pmov mm3,qword [eax+24]
pfmul mm2,qword [edx+16]
pfmul mm3,qword [edx+24]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+32]
pmov mm3,qword [eax+40]
pfmul mm2,qword [edx+32]
pfmul mm3,qword [edx+40]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+48]
pmov mm3,qword [eax+56]
pfmul mm2,qword [edx+48]
pfmul mm3,qword [edx+56]
pfadd mm0,mm2
pfadd mm1,mm3
pfadd mm0,mm1
pmov qword [sp(%$p)],mm0
femms
fld dword [sp(%$p)]
fadd dword [sp(%$p)+4]
endproc
proc scalar20_float32_3DNow
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
pmov mm0,qword [eax]
pmov mm1,qword [eax+8]
pfmul mm0,qword [edx]
pfmul mm1,qword [edx+8]
pmov mm2,qword [eax+16]
pmov mm3,qword [eax+24]
pfmul mm2,qword [edx+16]
pfmul mm3,qword [edx+24]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+32]
pmov mm3,qword [eax+40]
pfmul mm2,qword [edx+32]
pfmul mm3,qword [edx+40]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+48]
pmov mm3,qword [eax+56]
pfmul mm2,qword [edx+48]
pfmul mm3,qword [edx+56]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+64]
pmov mm3,qword [eax+72]
pfmul mm2,qword [edx+64]
pfmul mm3,qword [edx+72]
pfadd mm0,mm2
pfadd mm1,mm3
pfadd mm0,mm1
pmov qword [sp(%$p)],mm0
femms
fld dword [sp(%$p)]
fadd dword [sp(%$p)+4]
endproc
proc scalar24_float32_3DNow
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
pmov mm0,qword [eax]
pmov mm1,qword [eax+8]
pfmul mm0,qword [edx]
pfmul mm1,qword [edx+8]
pmov mm2,qword [eax+16]
pmov mm3,qword [eax+24]
pfmul mm2,qword [edx+16]
pfmul mm3,qword [edx+24]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+32]
pmov mm3,qword [eax+40]
pfmul mm2,qword [edx+32]
pfmul mm3,qword [edx+40]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+48]
pmov mm3,qword [eax+56]
pfmul mm2,qword [edx+48]
pfmul mm3,qword [edx+56]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+64]
pmov mm3,qword [eax+72]
pfmul mm2,qword [edx+64]
pfmul mm3,qword [edx+72]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+80]
pmov mm3,qword [eax+88]
pfmul mm2,qword [edx+80]
pfmul mm3,qword [edx+88]
pfadd mm0,mm2
pfadd mm1,mm3
pfadd mm0,mm1
pmov qword [sp(%$p)],mm0
femms
fld dword [sp(%$p)]
fadd dword [sp(%$p)+4]
endproc
proc scalar32_float32_3DNow
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
pmov mm0,qword [eax]
pmov mm1,qword [eax+8]
pfmul mm0,qword [edx]
pfmul mm1,qword [edx+8]
pmov mm2,qword [eax+16]
pmov mm3,qword [eax+24]
pfmul mm2,qword [edx+16]
pfmul mm3,qword [edx+24]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+32]
pmov mm3,qword [eax+40]
pfmul mm2,qword [edx+32]
pfmul mm3,qword [edx+40]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+48]
pmov mm3,qword [eax+56]
pfmul mm2,qword [edx+48]
pfmul mm3,qword [edx+56]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+64]
pmov mm3,qword [eax+72]
pfmul mm2,qword [edx+64]
pfmul mm3,qword [edx+72]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+80]
pmov mm3,qword [eax+88]
pfmul mm2,qword [edx+80]
pfmul mm3,qword [edx+88]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+96]
pmov mm3,qword [eax+104]
pfmul mm2,qword [edx+96]
pfmul mm3,qword [edx+104]
pfadd mm0,mm2
pfadd mm1,mm3
pmov mm2,qword [eax+112]
pmov mm3,qword [eax+120]
pfmul mm2,qword [edx+112]
pfmul mm3,qword [edx+120]
pfadd mm0,mm2
pfadd mm1,mm3
pfadd mm0,mm1
pmov qword [sp(%$p)],mm0
femms
fld dword [sp(%$p)]
fadd dword [sp(%$p)+4]
endproc
proc scalar4n_float32_3DNow
%$p arg 4
%$q arg 4
%$len arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
mov ecx,[sp(%$len)]
pmov mm0,qword [eax]
pmov mm1,qword [eax+8]
pfmul mm0,qword [edx]
pfmul mm1,qword [edx+8]
dec ecx
jz .ret4
add eax,byte 16
add edx,byte 16
.lbl4:
pmov mm2,qword [eax]
pmov mm3,qword [eax+8]
pfmul mm2,qword [edx]
pfmul mm3,qword [edx+8]
add eax,byte 16
add edx,byte 16
pfadd mm0,mm2
pfadd mm1,mm3
dec ecx
jnz .lbl4
.ret4: pfadd mm0,mm1
pmov qword [sp(%$p)],mm0
femms
fld dword [sp(%$p)]
fadd dword [sp(%$p)+4]
endproc
proc scalar1n_float32_3DNow
jmp scalar24_float32_i387
endproc
proc scalar04_float32_SIMD
jmp scalar04_float32_i387
endproc
proc scalar08_float32_SIMD
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
movups xmm0, [eax]
movups xmm1, [eax+16]
mulps xmm0, [edx]
mulps xmm1, [edx+16]
addps xmm0,xmm1
sub esp,16
movups [esp],xmm0
fld dword [esp+ 0]
fadd dword [esp+ 4]
fadd dword [esp+ 8]
fadd dword [esp+12]
add esp,16
endproc
proc scalar12_float32_SIMD
jmp scalar12_float32_i387
endproc
proc scalar16_float32_SIMD
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
movups xmm0, [eax]
movups xmm1, [eax+16]
mulps xmm0, [edx]
mulps xmm1, [edx+16]
movups xmm2, [eax+32]
movups xmm3, [eax+48]
mulps xmm2, [edx+32]
mulps xmm3, [edx+48]
addps xmm0,xmm2
addps xmm1,xmm3
addps xmm0,xmm1
sub esp,16
movups [esp],xmm0
fld dword [esp+ 0]
fadd dword [esp+ 4]
fadd dword [esp+ 8]
fadd dword [esp+12]
add esp,16
endproc
proc scalar20_float32_SIMD
jmp scalar20_float32_i387
endproc
proc scalar24_float32_SIMD
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
movups xmm0, [eax]
movups xmm1, [eax+16]
mulps xmm0, [edx]
mulps xmm1, [edx+16]
movups xmm2, [eax+32]
movups xmm3, [eax+48]
mulps xmm2, [edx+32]
mulps xmm3, [edx+48]
addps xmm0,xmm2
addps xmm1,xmm3
movups xmm2, [eax+64]
movups xmm3, [eax+80]
mulps xmm2, [edx+64]
mulps xmm3, [edx+80]
addps xmm0,xmm2
addps xmm1,xmm3
addps xmm0,xmm1
sub esp,16
movups [esp],xmm0
fld dword [esp+ 0]
fadd dword [esp+ 4]
fadd dword [esp+ 8]
fadd dword [esp+12]
add esp,16
endproc
proc scalar32_float32_SIMD
%$p arg 4
%$q arg 4
mov eax,[sp(%$p)]
mov edx,[sp(%$q)]
movups xmm0, [eax]
movups xmm1, [eax+16]
mulps xmm0, [edx]
mulps xmm1, [edx+16]
movups xmm2, [eax+32]
movups xmm3, [eax+48]
mulps xmm2, [edx+32]
mulps xmm3, [edx+48]
addps xmm0,xmm2
addps xmm1,xmm3
movups xmm2, [eax+64]
movups xmm3, [eax+80]
mulps xmm2, [edx+64]
mulps xmm3, [edx+80]
addps xmm0,xmm2
addps xmm1,xmm3
movups xmm2, [eax+96]
movups xmm3, [eax+112]
mulps xmm2, [edx+96]
mulps xmm3, [edx+112]
addps xmm0,xmm2
addps xmm1,xmm3
addps xmm0,xmm1
;sub esp,16
;movups [esp],xmm0
;fld dword [esp+ 0]
;fadd dword [esp+ 4]
;fadd dword [esp+ 8]
;fadd dword [esp+12]
;add esp,16
movhlps xmm1,xmm0
addps xmm0,xmm1
movlps [sp(%$p)],xmm0
fld dword [sp(%$p)]
fadd dword [sp(%$p)+4]
endproc
proc scalar4n_float32_SIMD
jmp scalar4n_float32_i387
endproc
proc scalar1n_float32_SIMD
jmp scalar1n_float32_i387
endproc
; end of scalar.nas