mirror of
https://github.com/cookiengineer/audacity
synced 2025-05-02 16:49:41 +02:00
Using LAME 3.10 Windows project files substantially changed from original, and included into audacity solution.
489 lines
9.4 KiB
Plaintext
489 lines
9.4 KiB
Plaintext
; from a new GOGO-no-coda (1999/09)
|
|
; Copyright (C) 1999 shigeo
|
|
; special thanks to Keiichi SAKAI, URURI
|
|
; hacked and back-ported to LAME
|
|
; by Takehiro TOMINAGA Nov 2000
|
|
|
|
%include "nasm.h"
|
|
|
|
globaldef fht_3DN
|
|
|
|
segment_data
|
|
align 16
|
|
costab dd 0x80000000, 0
|
|
dd 1.414213562,1.414213562
|
|
dd 9.238795283293805e-01, 9.238795283293805e-01
|
|
dd 3.826834424611044e-01, 3.826834424611044e-01
|
|
dd 9.951847264044178e-01, 9.951847264044178e-01
|
|
dd 9.801714304836734e-02, 9.801714304836734e-02
|
|
dd 9.996988186794428e-01, 9.996988186794428e-01
|
|
dd 2.454122920569705e-02, 2.454122920569705e-02
|
|
dd 9.999811752815535e-01, 9.999811752815535e-01
|
|
dd 6.135884819898878e-03, 6.135884819898878e-03
|
|
D_1_0_0_0 dd 0.0 , 1.0
|
|
|
|
segment_code
|
|
|
|
PIC_OFFSETTABLE
|
|
|
|
|
|
;void fht_3DN(float *fz, int nn);
|
|
|
|
proc fht_3DN
|
|
|
|
pushd ebp, ebx, esi, edi
|
|
|
|
sub esp, 20
|
|
|
|
call get_pc.bp
|
|
add ebp, PIC_BASE()
|
|
|
|
mov r0, [esp+40] ;fi
|
|
mov r1, [esp+44] ;r1 = nn
|
|
lea r3, [PIC_EBP_REL(costab)] ;tri = costab
|
|
lea r4, [r0+r1*8] ;r4 = fn = &fz[n]
|
|
mov [esp+16], r4
|
|
mov r4, 8 ;kx = k1/2
|
|
|
|
pmov mm7, [r3]
|
|
|
|
loopalign 16
|
|
.do1
|
|
lea r3, [r3+16] ;tri += 2;
|
|
pmov mm6, [PIC_EBP_REL(costab+8)]
|
|
lea r2, [r4+r4*2] ;k3*fsize/2
|
|
mov r5, 4 ;i = 1*fsize
|
|
|
|
loopalign 16
|
|
.do2:
|
|
lea r1, [r0+r4] ;gi = fi + kx
|
|
;f
|
|
pmov mm0, [r0] ;fi0
|
|
pmov mm1, [r0+r4*2] ;fi1
|
|
pmov mm2, [r0+r2*2] ;fi3
|
|
pmov mm3, [r0+r4*4] ;fi2
|
|
|
|
pupldq mm0, mm0 ;fi0 | fi0
|
|
pupldq mm1, mm1 ;fi1 | fi1
|
|
pupldq mm2, mm2 ;fi2 | fi2
|
|
pupldq mm3, mm3 ;fi3 | fi3
|
|
|
|
pxor mm1, mm7 ;fi1 | -fi1
|
|
pxor mm3, mm7 ;fi3 | -fi3
|
|
|
|
pfsub mm0, mm1 ;f1 | f0
|
|
pfsub mm2, mm3 ;f3 | f2
|
|
|
|
pmov mm4, mm0
|
|
pfadd mm0, mm2 ;f1+f3|f0+f2 = fi1 | fi0
|
|
pfsub mm4, mm2 ;f1-f3|f0-f2 = fi3 | fi2
|
|
|
|
pmovd [r0], mm0 ;fi[0]
|
|
puphdq mm0, mm0
|
|
pmovd [r0+r4*4], mm4 ;fi[k2]
|
|
puphdq mm4, mm4
|
|
|
|
pmovd [r0+r4*2], mm4 ;fi[k1]
|
|
pmovd [r0+r2*2], mm0 ;fi[k3]
|
|
lea r0, [r0+r4*8]
|
|
|
|
;g
|
|
pmov mm0, [r1] ;gi0
|
|
pmov mm1, [r1+r4*2] ;gi1
|
|
pmov mm2, [r1+r4*4] ;gi2
|
|
pmov mm3, [r1+r2*2] ;gi3
|
|
|
|
pupldq mm1, mm1
|
|
pupldq mm0, mm0 ;gi0 | gi0
|
|
pupldq mm2, mm3 ;gi3 | gi2
|
|
|
|
pxor mm1, mm7 ;gi1 | -gi1
|
|
|
|
pfsub mm0, mm1 ;gi0-gi1|gi0+gi1 = g1 | g0
|
|
pfmul mm2, mm6 ;gi3*SQRT2|gi2*SQRT2 = g3 | g2
|
|
|
|
pmov mm4, mm0
|
|
pfadd mm0, mm2 ;g1+g3|g0+g2 = gi1 | gi0
|
|
pfsub mm4, mm2 ;g1-g3|g0-g2 = gi3 | gi2
|
|
|
|
pmovd [r1], mm0 ;gi[0]
|
|
puphdq mm0, mm0
|
|
pmovd [r1+r4*4], mm4 ;gi[k2]
|
|
puphdq mm4, mm4
|
|
|
|
cmp r0, [esp + 16]
|
|
pmovd [r1+r4*2], mm0 ;gi[k1]
|
|
pmovd [r1+r2*2], mm4 ;gi[k3]
|
|
|
|
jb near .do2
|
|
|
|
pmov mm6, [r3+r5] ; this is not aligned address!!
|
|
|
|
loopalign 16
|
|
.for:
|
|
;
|
|
; mm6 = c1 | s1
|
|
; mm7 = 0x800000000 | 0
|
|
;
|
|
pmov mm1, mm6
|
|
mov r0, [esp+40] ; fz
|
|
puphdq mm1, mm1 ; c1 | c1
|
|
lea r1, [r0+r4*2]
|
|
pfadd mm1, mm1 ; c1+c1 | c1+c1
|
|
pfmul mm1, mm6 ; 2*c1*c1 | 2*c1*s1
|
|
pfsub mm1, [PIC_EBP_REL(D_1_0_0_0)] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2
|
|
|
|
pmov mm0, mm1
|
|
pxor mm7, mm6 ; c1 | -s1
|
|
|
|
pupldq mm2, mm0
|
|
pupldq mm3, mm6 ; ** | c1
|
|
puphdq mm0, mm2 ; s2 | c2
|
|
puphdq mm6, mm3 ;-s1 | c1
|
|
|
|
pxor mm0, [PIC_EBP_REL(costab)] ; c2 | -s2
|
|
|
|
; mm0 = s2| c2
|
|
; mm1 = -c2| s2
|
|
; mm6 = c1| s1
|
|
; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
|
|
|
|
pmov [esp], mm0
|
|
pmov [esp+8], mm1
|
|
|
|
sub r1, r5 ;r1 = gi
|
|
add r0, r5 ;r0 = fi
|
|
|
|
loopalign 16
|
|
.do3:
|
|
pmov mm2, [r0+r4*2] ; fi[k1]
|
|
pmov mm4, [r1+r4*2] ; gi[k1]
|
|
pmov mm3, [r0+r2*2] ; fi[k3]
|
|
pmov mm5, [r1+r2*2] ; gi[k3]
|
|
|
|
pupldq mm2, mm2 ; fi1 | fi1
|
|
pupldq mm4, mm4 ; gi1 | gi1
|
|
pupldq mm3, mm3 ; fi3 | fi3
|
|
pupldq mm5, mm5 ; gi3 | gi3
|
|
|
|
pfmul mm2, mm0 ; s2 * fi1 | c2 * fi1
|
|
pfmul mm4, mm1 ;-c2 * gi1 | s2 * gi1
|
|
pfmul mm3, mm0 ; s2 * fi3 | c2 * fi3
|
|
pfmul mm5, mm1 ;-c2 * gi3 | s2 * gi3
|
|
|
|
pfadd mm2, mm4 ;b | a
|
|
pfadd mm3, mm5 ;d | c
|
|
|
|
pmov mm0, [r0]
|
|
pmov mm4, [r1]
|
|
pmov mm1, [r0+r4*4]
|
|
pmov mm5, [r1+r4*4]
|
|
|
|
pupldq mm0, mm4 ;gi0 | fi0
|
|
pupldq mm1, mm5 ;gi2 | fi2
|
|
|
|
pmov mm4, mm2
|
|
pmov mm5, mm3
|
|
|
|
pfadd mm2, mm0 ;g0 | f0
|
|
pfadd mm3, mm1 ;g2 | f2
|
|
|
|
pfsub mm0, mm4 ;g1 | f1
|
|
pfsub mm1, mm5 ;g3 | f3
|
|
|
|
pmov mm4, mm3
|
|
pmov mm5, mm1
|
|
|
|
pupldq mm4, mm4 ;f2 | f2
|
|
puphdq mm5, mm5 ;g3 | g3
|
|
puphdq mm3, mm3 ;g2 | g2
|
|
pupldq mm1, mm1 ;f3 | f3
|
|
|
|
pfmul mm4, mm6 ;f2 * c1 | f2 * s1
|
|
pfmul mm5, mm7 ;g3 * s1 | g3 *-c1
|
|
pfmul mm3, mm6 ;g2 * c1 | g2 * s1
|
|
pfmul mm1, mm7 ;f3 * s1 | f3 *-c1
|
|
|
|
pfadd mm4, mm5 ;a | b
|
|
pfsub mm3, mm1 ;d | c
|
|
|
|
pmov mm5, mm2
|
|
pmov mm1, mm0
|
|
|
|
pupldq mm2, mm2 ;f0 | f0
|
|
pupldq mm0, mm0 ;f1 | f1
|
|
|
|
puphdq mm1, mm2 ;f0 | g1
|
|
puphdq mm5, mm0 ;f1 | g0
|
|
|
|
pmov mm2, mm4
|
|
pmov mm0, mm3
|
|
|
|
pfadd mm4, mm1 ;fi0 | gi1
|
|
pfadd mm3, mm5 ;fi1 | gi0
|
|
pfsub mm1, mm2 ;fi2 | gi3
|
|
pfsub mm5, mm0 ;fi3 | gi2
|
|
|
|
pmovd [r1+r4*2], mm4 ;gi[k1]
|
|
puphdq mm4, mm4
|
|
pmovd [r1], mm3 ;gi[0]
|
|
puphdq mm3, mm3
|
|
pmovd [r1+r2*2], mm1 ;gi[k3]
|
|
puphdq mm1, mm1
|
|
pmovd [r1+r4*4], mm5 ;gi[k2]
|
|
puphdq mm5, mm5
|
|
|
|
pmovd [r0], mm4 ;fi[0]
|
|
pmovd [r0+r4*2], mm3 ;fi[k1]
|
|
pmovd [r0+r4*4], mm1 ;fi[k2]
|
|
pmovd [r0+r2*2], mm5 ;fi[k3]
|
|
|
|
lea r0, [r0+r4*8]
|
|
lea r1, [r1+r4*8]
|
|
cmp r0, [esp + 16]
|
|
pmov mm0, [esp]
|
|
pmov mm1, [esp+8]
|
|
|
|
jb near .do3
|
|
|
|
add r5, 4
|
|
; mm6 = c1| s1
|
|
; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
|
|
pfmul mm6, [r3] ; c1*a | s1*a
|
|
pfmul mm7, [r3+8] ; s1*b |-c1*b
|
|
cmp r5, r4
|
|
|
|
pfsub mm6, mm7 ; c1*a-s1*b | s1*a+c1*b
|
|
pupldq mm7,mm6
|
|
puphdq mm6,mm7
|
|
pmov mm7, [PIC_EBP_REL(costab)]
|
|
jb near .for
|
|
|
|
mov r0, [esp+40] ;fi
|
|
cmp r4, [esp+40+4]
|
|
lea r4, [r4*4] ;kx *= 4
|
|
|
|
jb near .do1
|
|
.exitttt
|
|
femms
|
|
add esp,20
|
|
popd ebp, ebx, esi, edi
|
|
endproc
|
|
|
|
|
|
;void fht_E3DN(float *fz, int nn);
|
|
|
|
proc fht_E3DN
|
|
|
|
pushd ebp, ebx, esi, edi
|
|
|
|
sub esp, 20
|
|
|
|
call get_pc.bp
|
|
add ebp, PIC_BASE()
|
|
|
|
mov r0, [esp+40] ;fi
|
|
mov r1, [esp+44] ;r1 = nn
|
|
lea r3, [PIC_EBP_REL(costab)] ;tri = costab
|
|
lea r4, [r0+r1*8] ;r4 = fn = &fz[n]
|
|
mov [esp+16], r4
|
|
mov r4, 8 ;kx = k1/2
|
|
|
|
pmov mm7, [r3]
|
|
|
|
loopalign 16
|
|
.do1
|
|
lea r3, [r3+16] ;tri += 2;
|
|
pmov mm6, [PIC_EBP_REL(costab+8)]
|
|
lea r2, [r4+r4*2] ;k3*fsize/2
|
|
mov r5, 4 ;i = 1*fsize
|
|
|
|
loopalign 16
|
|
.do2:
|
|
lea r1, [r0+r4] ;gi = fi + kx
|
|
;f
|
|
pmov mm0, [r0] ; X | fi0
|
|
pmov mm1, [r0+r4*4] ; X | fi2
|
|
pupldq mm0, [r0+r4*2] ;fi1 | fi0
|
|
pupldq mm1, [r0+r2*2] ;fi3 | fi2
|
|
pfpnacc mm0, mm0 ;fi0+fi1 | fi0-fi1 = f0|f1
|
|
pfpnacc mm1, mm1 ;fi2+fi3 | fi2-fi3 = f2|f3
|
|
|
|
pmov mm2, mm0
|
|
pfadd mm0, mm1 ;f0+f2|f1+f3 = fi0 | fi1
|
|
pfsub mm2, mm1 ;f0-f2|f1-f3 = fi2 | fi3
|
|
|
|
pmovd [r0+r4*2], mm0 ;fi[k1]
|
|
pmovd [r0+r2*2], mm2 ;fi[k3]
|
|
|
|
puphdq mm0, mm0
|
|
puphdq mm2, mm2
|
|
pmovd [r0], mm0 ;fi[0]
|
|
pmovd [r0+r4*4], mm2 ;fi[k2]
|
|
|
|
lea r0, [r0+r4*8]
|
|
;g
|
|
pmov mm3, [r1] ; gi0
|
|
pmov mm4, [r1+r2*2] ; gi3
|
|
pupldq mm3, [r1+r4*2] ;gi1|gi0
|
|
pupldq mm4, [r1+r4*4] ;gi2|gi3
|
|
|
|
pfpnacc mm3, mm3 ;gi0+gi1 |gi0-gi1 = f0|f1
|
|
pfmul mm4, mm6 ;gi2*SQRT2|gi3*SQRT2 = f2|f3
|
|
|
|
pmov mm5, mm3
|
|
pfadd mm3, mm4 ;f0+f2|f1+f3
|
|
pfsub mm5, mm4 ;f0-f2|f1-f3
|
|
|
|
cmp r0, [esp + 16]
|
|
pmovd [r1+r4*2], mm3 ;gi[k1]
|
|
pmovd [r1+r2*2], mm5 ;gi[k3]
|
|
puphdq mm3, mm3
|
|
puphdq mm5, mm5
|
|
pmovd [r1], mm3 ;gi[0]
|
|
pmovd [r1+r4*4], mm5 ;gi[k2]
|
|
|
|
jb near .do2
|
|
|
|
pmov mm6, [r3+r5] ; this is not aligned address!!
|
|
|
|
loopalign 16
|
|
.for:
|
|
;
|
|
; mm6 = c1 | s1
|
|
; mm7 = 0x800000000 | 0
|
|
;
|
|
pmov mm5, mm6
|
|
mov r0, [esp+40] ; fz
|
|
puphdq mm5, mm5 ; c1 | c1
|
|
lea r1, [r0+r4*2]
|
|
pfadd mm5, mm5 ; c1+c1 | c1+c1
|
|
pfmul mm5, mm6 ; 2*c1*c1 | 2*c1*s1
|
|
pfsub mm5, [PIC_EBP_REL(D_1_0_0_0)] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2
|
|
|
|
pswapd mm4, mm5 ; s2 |-c2
|
|
pxor mm4, mm7 ; s2 | c2
|
|
pxor mm7, mm6 ; c1 |-s1
|
|
pswapd mm6, mm6 ; s1 | c1
|
|
|
|
; mm4 = s2| c2
|
|
; mm5 = -c2| s2
|
|
; mm6 = c1| s1
|
|
; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
|
|
|
|
pmov [esp], mm4
|
|
pmov [esp+8], mm5
|
|
|
|
sub r1, r5 ;r1 = gi
|
|
add r0, r5 ;r0 = fi
|
|
|
|
loopalign 16
|
|
.do3:
|
|
pmov mm0, [r0+r2*2] ; fi[k1]
|
|
pmov mm2, [r1+r2*2] ; gi[k1]
|
|
pmov mm1, [r0+r4*2] ; fi[k3]
|
|
pmov mm3, [r1+r4*2] ; gi[k3]
|
|
|
|
pupldq mm0, mm0
|
|
pupldq mm2, mm2
|
|
pupldq mm1, mm1
|
|
pupldq mm3, mm3
|
|
|
|
pfmul mm0, mm4
|
|
pfmul mm2, mm5
|
|
pfmul mm1, mm4
|
|
pfmul mm3, mm5
|
|
|
|
pfadd mm0, mm2 ;d | c
|
|
pfadd mm1, mm3 ;b | a
|
|
|
|
pmov mm2, [r0+r4*4] ;fi2
|
|
pupldq mm3, [r1+r4*4] ;gi2 | -
|
|
pmov mm4, [r0] ;fi0
|
|
pupldq mm5, [r1] ;gi0 | -
|
|
|
|
pupldq mm2, mm0 ;c | fi2
|
|
puphdq mm3, mm0 ;d | gi2
|
|
pupldq mm4, mm1 ;a | fi0
|
|
puphdq mm5, mm1 ;b | gi0
|
|
|
|
pfpnacc mm2, mm2 ;f2 | f3
|
|
pfpnacc mm3, mm3 ;g2 | g3
|
|
pfpnacc mm4, mm4 ;f0 | f1
|
|
pfpnacc mm5, mm5 ;g0 | g1
|
|
|
|
pmov mm0, mm2
|
|
pmov mm1, mm3
|
|
pupldq mm2, mm2 ;f3 | f3
|
|
pupldq mm3, mm3 ;g3 | g3
|
|
puphdq mm0, mm0 ;f2 | f2
|
|
puphdq mm1, mm1 ;g2 | g2
|
|
|
|
pswapd mm4, mm4 ;f1 | f0
|
|
pswapd mm5, mm5 ;g1 | g0
|
|
|
|
pfmul mm0, mm7 ;f2 * s1 | f2 *-c1
|
|
pfmul mm3, mm6 ;g3 * c1 | g3 * s1
|
|
pfmul mm1, mm6 ;g2 * c1 | g2 * s1
|
|
pfmul mm2, mm7 ;f3 * s1 | f3 *-c1
|
|
|
|
pfsub mm0, mm3 ; b |-a
|
|
pfsub mm1, mm2 ; d | c
|
|
|
|
pmov mm2, mm5
|
|
pmov mm3, mm4
|
|
pupldq mm4, mm0 ;-a | f0
|
|
pupldq mm5, mm1 ; c | g0
|
|
puphdq mm2, mm0 ; b | g1
|
|
puphdq mm3, mm1 ; d | f1
|
|
|
|
pfpnacc mm4, mm4 ;fi2 | fi0
|
|
pfpnacc mm5, mm5 ;gi0 | gi2
|
|
pfpnacc mm2, mm2 ;gi1 | gi3
|
|
pfpnacc mm3, mm3 ;fi1 | fi3
|
|
|
|
pmovd [r0], mm4 ;fi[0]
|
|
pmovd [r1+r4*4], mm5 ;gi[k2]
|
|
pmovd [r1+r2*2], mm2 ;gi[k3]
|
|
pmovd [r0+r2*2], mm3 ;fi[k3]
|
|
|
|
puphdq mm4, mm4
|
|
puphdq mm5, mm5
|
|
puphdq mm2, mm2
|
|
puphdq mm3, mm3
|
|
pmovd [r0+r4*4], mm4 ;fi[k2]
|
|
pmovd [r1], mm5 ;gi[0]
|
|
pmovd [r1+r4*2], mm2 ;gi[k1]
|
|
pmovd [r0+r4*2], mm3 ;fi[k1]
|
|
|
|
lea r0, [r0+r4*8]
|
|
lea r1, [r1+r4*8]
|
|
cmp r0, [esp + 16]
|
|
pmov mm4, [esp]
|
|
pmov mm5, [esp+8]
|
|
|
|
jb near .do3
|
|
|
|
add r5, 4
|
|
; mm6 = c1| s1
|
|
; mm7 = s1|-c1 (we use the opposite sign. from GOGO here)
|
|
pfmul mm6, [r3] ; c1*a | s1*a
|
|
pfmul mm7, [r3+8] ; s1*b |-c1*b
|
|
cmp r5, r4
|
|
|
|
pfsub mm6, mm7 ; c1*a-s1*b | s1*a+c1*b
|
|
pswapd mm6, mm6 ; ??? ; s1*a+c1*b | c1*a-s1*b
|
|
pmov mm7, [PIC_EBP_REL(costab)]
|
|
jb near .for
|
|
|
|
mov r0, [esp+40] ;fi
|
|
cmp r4, [esp+40+4]
|
|
lea r4, [r4*4] ;kx *= 4
|
|
|
|
jb near .do1
|
|
.exitttt
|
|
femms
|
|
add esp,20
|
|
popd ebp, ebx, esi, edi
|
|
endproc
|