/********************************************************************** Audacity: A Digital Audio Editor RealFFT48x.cpp Philip Van Baren Andrew Hallendorff (SSE Mods) *******************************************************************//** \file RealFFT48x.cpp \brief Real FFT with SSE acceleration. *//****************************************************************/ /* * Program: REALFFTF.C * Author: Philip Van Baren * Date: 2 September 1993 * * Description: These routines perform an FFT on real data to get a conjugate-symmetric * output, and an inverse FFT on conjugate-symmetric input to get a real * output sequence. * * This code is for floating point data. * * Modified 8/19/1998 by Philip Van Baren * - made the InitializeFFT and EndFFT routines take a structure * holding the length and pointers to the BitReversed and SinTable * tables. * Modified 5/23/2009 by Philip Van Baren * - Added GetFFT and ReleaseFFT routines to retain common SinTable * and BitReversed tables so they don't need to be reallocated * and recomputed on every call. * - Added Reorder* functions to undo the bit-reversal * Modified 15 April 2016 Paul Licameli * - C++11 smart pointers * * Copyright (C) 2009 Philip VanBaren * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include "Audacity.h" // for USE_* macros #include "RealFFTf48x.h" #include "Experimental.h" #ifdef EXPERIMENTAL_EQ_SSE_THREADED // at the moment these two are mutually exclusive //#define USE_SSEMATHFUNC //#define TEST_COSSINBIT_TABLES #ifndef USE_SSE2 #define USE_SSE2 #endif #include #include #include #include "RealFFTf.h" #ifdef __WXMSW__ #pragma warning(disable:4305) #else #endif #include "SseMathFuncs.h" #include #ifndef M_PI #define M_PI 3.14159265358979323846 /* pi */ #endif int tableMask=0; bool useBitReverseTable=false; bool useSinCosTable=false; void TableUsage(int iMask) { tableMask=iMask; useBitReverseTable=(iMask&1)!=0; useSinCosTable=(iMask&2)!=0; } SinCosTable::SinCosTable() : mSinCosTablePow(13) { size_t tableSize=1<>=1) if(i&maskLow) sSmallRBTable[i]|=maskHigh; } }; }; static BitReverser sBitReverser; /* array of functions there prob is a better way to do this */ int SmallVRB0(int bits) { return bits; }; int SmallVRB1(int bits) { return sSmallRBTable[bits]>>7; }; int SmallVRB2(int bits) { return sSmallRBTable[bits]>>6; }; int SmallVRB3(int bits) { return sSmallRBTable[bits]>>5; }; int SmallVRB4(int bits) { return sSmallRBTable[bits]>>4; }; int SmallVRB5(int bits) { return sSmallRBTable[bits]>>3; }; int SmallVRB6(int bits) { return sSmallRBTable[bits]>>2; }; int SmallVRB7(int bits) { return sSmallRBTable[bits]>>1; }; int SmallVRB8(int bits) { return sSmallRBTable[bits]; }; int SmallVRB9(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<1)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>7); }; int SmallVRB10(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<2)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>6); }; int SmallVRB11(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<3)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>5); }; int SmallVRB12(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<4)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>4); }; int SmallVRB13(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<5)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>3); }; int SmallVRB14(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<6)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>2); }; int SmallVRB15(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<7)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>1); }; int SmallVRB16(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<8)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]); }; int (*SmallVRB[])(int bits) = { SmallVRB0, SmallVRB1, SmallVRB2, SmallVRB3, SmallVRB4, SmallVRB5, SmallVRB6, SmallVRB7, SmallVRB8, SmallVRB9, SmallVRB10, SmallVRB11, SmallVRB12, SmallVRB13, SmallVRB14,SmallVRB15, SmallVRB16 }; int SmallRB(int bits, int numberBits) { // return ( (sSmallRBTable[*((unsigned char *)&bits)]<<16) + (sSmallRBTable[*(((unsigned char *)&bits)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&bits)+2)] ))>>(24-numberBits); return ( (sSmallRBTable[*((unsigned char *)&bits)]<<8) + (sSmallRBTable[*(((unsigned char *)&bits)+1)]) )>>(16-numberBits); }; /* wrapper functions. If passed -1 function choice will be made locally */ void RealFFTf1x(fft_type *buffer, FFTParam *h, int functionType) { switch(functionType) { case FFT_SinCosTableVBR16: RealFFTf1xSinCosTableVBR16( buffer, h); break; case FFT_SinCosTableBR16: RealFFTf1xSinCosTableBR16( buffer, h); break; case FFT_FastMathBR16: RealFFTf1xFastMathBR16( buffer, h); break; case FFT_FastMathBR24: RealFFTf1xFastMathBR24( buffer, h); break; case FFT_SinCosBRTable: default: RealFFTf1xSinCosBRTable( buffer, h); }; } void InverseRealFFTf1x(fft_type *buffer, FFTParam *h, int functionType) { switch(functionType) { case FFT_SinCosTableVBR16: InverseRealFFTf1xSinCosTableVBR16( buffer, h); break; case FFT_SinCosTableBR16: InverseRealFFTf1xSinCosTableBR16( buffer, h); break; case FFT_FastMathBR16: InverseRealFFTf1xFastMathBR16( buffer, h); break; case FFT_FastMathBR24: InverseRealFFTf1xFastMathBR24( buffer, h); break; case FFT_SinCosBRTable: default: InverseRealFFTf1xSinCosBRTable( buffer, h); }; } void ReorderToTime1x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType) { switch(functionType) { case FFT_SinCosTableVBR16: ReorderToTime1xSinCosTableVBR16( hFFT, buffer, TimeOut); break; case FFT_SinCosTableBR16: ReorderToTime1xSinCosTableBR16( hFFT, buffer, TimeOut); break; case FFT_FastMathBR16: ReorderToTime1xFastMathBR16( hFFT, buffer, TimeOut); break; case FFT_FastMathBR24: ReorderToTime1xFastMathBR24( hFFT, buffer, TimeOut); break; case FFT_SinCosBRTable: default: ReorderToTime1xSinCosBRTable( hFFT, buffer, TimeOut); }; } void ReorderToFreq1x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType) { switch(functionType) { case FFT_SinCosTableVBR16: ReorderToFreq1xSinCosTableVBR16(hFFT, buffer, RealOut, ImagOut); break; case FFT_SinCosTableBR16: ReorderToFreq1xSinCosTableBR16(hFFT, buffer, RealOut, ImagOut); break; case FFT_FastMathBR16: ReorderToFreq1xFastMathBR16(hFFT, buffer, RealOut, ImagOut); break; case FFT_FastMathBR24: ReorderToFreq1xFastMathBR24(hFFT, buffer, RealOut, ImagOut); break; case FFT_SinCosBRTable: default: ReorderToFreq1xSinCosBRTable(hFFT, buffer, RealOut, ImagOut); }; } void RealFFTf4x( fft_type *buffer, FFTParam *h, int functionType) { switch(functionType) { case FFT_SinCosTableVBR16: RealFFTf4xSinCosTableVBR16( buffer, h); break; case FFT_SinCosTableBR16: RealFFTf4xSinCosTableBR16( buffer, h); break; case FFT_FastMathBR16: RealFFTf4xFastMathBR16( buffer, h); break; case FFT_FastMathBR24: RealFFTf4xFastMathBR24( buffer, h); break; case FFT_SinCosBRTable: default: RealFFTf4xSinCosBRTable( buffer, h); }; } void InverseRealFFTf4x( fft_type *buffer, FFTParam *h, int functionType) { switch(functionType) { case FFT_SinCosTableVBR16: InverseRealFFTf4xSinCosTableVBR16( buffer, h); break; case FFT_SinCosTableBR16: InverseRealFFTf4xSinCosTableBR16( buffer, h); break; case FFT_FastMathBR16: InverseRealFFTf4xFastMathBR16( buffer, h); break; case FFT_FastMathBR24: InverseRealFFTf4xFastMathBR24( buffer, h); break; case FFT_SinCosBRTable: default: InverseRealFFTf4xSinCosBRTable( buffer, h); }; } void ReorderToTime4x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType) { switch(functionType) { case FFT_SinCosTableVBR16: ReorderToTime4xSinCosTableVBR16( hFFT, buffer, TimeOut); break; case FFT_SinCosTableBR16: ReorderToTime4xSinCosTableBR16( hFFT, buffer, TimeOut); break; case FFT_FastMathBR16: ReorderToTime4xFastMathBR16( hFFT, buffer, TimeOut); break; case FFT_FastMathBR24: ReorderToTime4xFastMathBR24( hFFT, buffer, TimeOut); break; case FFT_SinCosBRTable: default: ReorderToTime4xSinCosBRTable( hFFT, buffer, TimeOut); }; } void ReorderToFreq4x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType) { switch(functionType) { case FFT_SinCosTableVBR16: ReorderToFreq4xSinCosTableVBR16(hFFT, buffer, RealOut, ImagOut); break; case FFT_SinCosTableBR16: ReorderToFreq4xSinCosTableBR16(hFFT, buffer, RealOut, ImagOut); break; case FFT_FastMathBR16: ReorderToFreq4xFastMathBR16(hFFT, buffer, RealOut, ImagOut); break; case FFT_FastMathBR24: ReorderToFreq4xFastMathBR24(hFFT, buffer, RealOut, ImagOut); break; case FFT_SinCosBRTable: default: ReorderToFreq4xSinCosBRTable(hFFT, buffer, RealOut, ImagOut); }; } #define REAL_SINCOSBRTABLE #ifdef REAL_SINCOSBRTABLE /* * Forward FFT routine. Must call GetFFT(fftlen) first! * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ] * Imag_i = buffer[ h->BitReversed[i]+1 ] ) * Input is in normal order. * * Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void RealFFTf1xSinCosBRTable(fft_type *buffer, FFTParam *h) { fft_type *A,*B; fft_type *sptr; fft_type *endptr1,*endptr2; int *br1,*br2; fft_type HRplus,HRminus,HIplus,HIminus; fft_type v1,v2,sin,cos; auto ButterfliesPerGroup = h->Points / 2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = buffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = buffer; B = buffer + ButterfliesPerGroup * 2; sptr = h->SinTable.get(); while(A < endptr1) { sin = *sptr; cos = *(sptr + 1); endptr2 = B; while(A < endptr2) { v1 = *B * cos + *(B+1) * sin; v2 = *B * sin - *(B+1) * cos; *B = (*A + v1); *(A++) = *(B++) - 2 * v1; *B = (*A - v2); *(A++) = *(B++) + 2 * v2; } A = B; B += ButterfliesPerGroup * 2; sptr += 2; } ButterfliesPerGroup >>= 1; } /* Massage output to get the output for a real input sequence. */ br1 = h->BitReversed.get() + 1; br2 = h->BitReversed.get() + h->Points - 1; while(br1 < br2) { sin=h->SinTable[*br1]; cos=h->SinTable[*br1+1]; A=buffer+*br1; B=buffer+*br2; HRplus = (HRminus = *A - *B ) + (*B * 2); HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); v1 = (sin*HRminus - cos*HIplus); v2 = (cos*HRminus + sin*HIplus); *A = (HRplus + v1) * (fft_type)0.5; *B = *A - v1; *(A+1) = (HIminus + v2) * (fft_type)0.5; *(B+1) = *(A+1) - HIminus; br1++; br2--; } /* Handle the center bin (just need a conjugate) */ A=buffer+*br1+1; *A=-*A; /* Handle DC bin separately - and ignore the Fs/2 bin buffer[0]+=buffer[1]; buffer[1]=(fft_type)0;*/ /* Handle DC and Fs/2 bins separately */ /* Put the Fs/2 value into the imaginary part of the DC bin */ v1=buffer[0]-buffer[1]; buffer[0]+=buffer[1]; buffer[1]=v1; } /* Description: This routine performs an inverse FFT to real data. * This code is for floating point data. * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) * Input is in normal order, interleaved (real,imaginary) complex data * You must call GetFFT(fftlen) first to initialize some buffers! * * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void InverseRealFFTf1xSinCosBRTable(fft_type *buffer, FFTParam *h) { fft_type *A,*B; fft_type *sptr; fft_type *endptr1,*endptr2; int *br1; fft_type HRplus,HRminus,HIplus,HIminus; fft_type v1,v2,sin,cos; auto ButterfliesPerGroup = h->Points / 2; /* Massage input to get the input for a real output sequence. */ A = buffer + 2; B = buffer + h->Points * 2 - 2; br1 = h->BitReversed.get() + 1; while(A < B) { sin=h->SinTable[*br1]; cos=h->SinTable[*br1+1]; HRplus = (HRminus = *A - *B ) + (*B * 2); HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); v1 = (sin*HRminus + cos*HIplus); v2 = (cos*HRminus - sin*HIplus); *A = (HRplus + v1) * (fft_type)0.5; *B = *A - v1; *(A+1) = (HIminus - v2) * (fft_type)0.5; *(B+1) = *(A+1) - HIminus; A+=2; B-=2; br1++; } /* Handle center bin (just need conjugate) */ *(A+1)=-*(A+1); /* Handle DC bin separately - this ignores any Fs/2 component buffer[1]=buffer[0]=buffer[0]/2;*/ /* Handle DC and Fs/2 bins specially */ /* The DC bin is passed in as the real part of the DC complex value */ /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ /* (v1+v2) = buffer[0] == the DC component */ /* (v1-v2) = buffer[1] == the Fs/2 component */ v1=0.5f*(buffer[0]+buffer[1]); v2=0.5f*(buffer[0]-buffer[1]); buffer[0]=v1; buffer[1]=v2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = buffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = buffer; B = buffer + ButterfliesPerGroup * 2; sptr = h->SinTable.get(); while(A < endptr1) { sin = *(sptr++); cos = *(sptr++); endptr2 = B; while(A < endptr2) { v1 = *B * cos - *(B + 1) * sin; v2 = *B * sin + *(B + 1) * cos; *B = (*A + v1) * (fft_type)0.5; *(A++) = *(B++) - v1; *B = (*A + v2) * (fft_type)0.5; *(A++) = *(B++) - v2; } A = B; B += ButterfliesPerGroup * 2; } ButterfliesPerGroup >>= 1; } } void ReorderToFreq1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) { // Copy the data into the real and imaginary outputs for(size_t i = 1; i < hFFT->Points; i++) { RealOut[i]=buffer[hFFT->BitReversed[i] ]; ImagOut[i]=buffer[hFFT->BitReversed[i]+1]; } RealOut[0] = buffer[0]; // DC component ImagOut[0] = 0; RealOut[hFFT->Points] = buffer[1]; // Fs/2 component ImagOut[hFFT->Points] = 0; } void ReorderToTime1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut) { // Copy the data into the real outputs for(size_t i = 0; i < hFFT->Points; i++) { TimeOut[i*2 ]=buffer[hFFT->BitReversed[i] ]; TimeOut[i*2+1]=buffer[hFFT->BitReversed[i]+1]; } } // 4x processing simd void RealFFTf4xSinCosBRTable(fft_type *buffer, FFTParam *h) { __m128 *localBuffer=(__m128 *)buffer; __m128 *A,*B; fft_type *sptr; __m128 *endptr1,*endptr2; int br1Index, br2Index; int br1Value, br2Value; __m128 HRplus,HRminus,HIplus,HIminus; __m128 v1,v2,sin,cos; auto ButterfliesPerGroup = h->Points / 2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = &localBuffer[h->Points * 2]; while(ButterfliesPerGroup > 0) { A = localBuffer; B = &localBuffer[ButterfliesPerGroup * 2]; sptr = h->SinTable.get(); while(A < endptr1) { sin = _mm_set1_ps(*(sptr++)); cos = _mm_set1_ps(*(sptr++)); endptr2 = B; while(A < endptr2) { v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin)); v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos)); *B = _mm_add_ps( *A, v1); __m128 temp128 = _mm_set1_ps( 2.0); *(A++) = _mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1)); *B = _mm_sub_ps(*A,v2); *(A++) = _mm_add_ps(*(B++), _mm_mul_ps(temp128, v2)); } A = B; B = &B[ButterfliesPerGroup * 2]; } ButterfliesPerGroup >>= 1; } /* Massage output to get the output for a real input sequence. */ br1Index = 1; // h->BitReversed + 1; br2Index = h->Points - 1; //h->BitReversed + h->Points - 1; while(br1IndexBitReversed[br1Index]; br2Value=h->BitReversed[br2Index]; sin=_mm_set1_ps(h->SinTable[br1Value]); cos=_mm_set1_ps(h->SinTable[br1Value+1]); A=&localBuffer[br1Value]; B=&localBuffer[br2Value]; __m128 temp128 = _mm_set1_ps( 2.0); HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128)); HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128)); v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus)); v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus)); temp128 = _mm_set1_ps( 0.5); *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128); *B = _mm_sub_ps(*A, v1); *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128); *(B+1) = _mm_sub_ps(*(A+1), HIminus); br1Index++; br2Index--; } /* Handle the center bin (just need a conjugate) */ A=&localBuffer[h->BitReversed[br1Index]+1]; // negate sse style *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f)); /* Handle DC and Fs/2 bins separately */ /* Put the Fs/2 value into the imaginary part of the DC bin */ v1=_mm_sub_ps(localBuffer[0], localBuffer[1]); localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]); localBuffer[1]=v1; } /* Description: This routine performs an inverse FFT to real data. * This code is for floating point data. * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) * Input is in normal order, interleaved (real,imaginary) complex data * You must call GetFFT(fftlen) first to initialize some buffers! * * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void InverseRealFFTf4xSinCosBRTable(fft_type *buffer, FFTParam *h) { __m128 *localBuffer=(__m128 *)buffer; __m128 *A,*B; fft_type *sptr; __m128 *endptr1,*endptr2; int br1Index, br1Value; __m128 HRplus,HRminus,HIplus,HIminus; __m128 v1,v2,sin,cos; auto ButterfliesPerGroup = h->Points / 2; /* Massage input to get the input for a real output sequence. */ A = localBuffer + 2; B = localBuffer + h->Points * 2 - 2; br1Index = 1; //h->BitReversed + 1; while(A < B) { br1Value = h->BitReversed[br1Index]; sin = _mm_set1_ps(h->SinTable[br1Value]); cos = _mm_set1_ps(h->SinTable[br1Value + 1]); HRminus = _mm_sub_ps(*A, *B); HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0))); HIminus = _mm_sub_ps( *(A+1), *(B+1)); HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0))); v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus)); v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus)); *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5)); *B = _mm_sub_ps(*A, v1); *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5)); *(B+1) = _mm_sub_ps(*(A+1), HIminus); A=&A[2]; B=&B[-2]; br1Index++; } /* Handle center bin (just need conjugate) */ // negate sse style *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f)); /* Handle DC bin separately - this ignores any Fs/2 component buffer[1]=buffer[0]=buffer[0]/2;*/ /* Handle DC and Fs/2 bins specially */ /* The DC bin is passed in as the real part of the DC complex value */ /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ /* (v1+v2) = buffer[0] == the DC component */ /* (v1-v2) = buffer[1] == the Fs/2 component */ v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1])); v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1])); localBuffer[0]=v1; localBuffer[1]=v2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = localBuffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = localBuffer; B = localBuffer + ButterfliesPerGroup * 2; sptr = h->SinTable.get(); while(A < endptr1) { sin = _mm_set1_ps(*(sptr++)); cos = _mm_set1_ps(*(sptr++)); endptr2 = B; while(A < endptr2) { v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B + 1), sin)); v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B + 1), cos)); *B = _mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5)); *(A++) = _mm_sub_ps(*(B++), v1); *B = _mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5)); *(A++) = _mm_sub_ps(*(B++), v2); } A = B; B = &B[ButterfliesPerGroup * 2]; } ButterfliesPerGroup >>= 1; } } void ReorderToFreq4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) { __m128 *localBuffer=(__m128 *)buffer; __m128 *localRealOut=(__m128 *)RealOut; __m128 *localImagOut=(__m128 *)ImagOut; // Copy the data into the real and imaginary outputs for(size_t i = 1; i < hFFT->Points; i++) { int brValue; brValue=hFFT->BitReversed[i]; localRealOut[i]=localBuffer[brValue ]; localImagOut[i]=localBuffer[brValue+1]; } localRealOut[0] = localBuffer[0]; // DC component localImagOut[0] = _mm_set1_ps(0.0); localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component localImagOut[hFFT->Points] = _mm_set1_ps(0.0); } void ReorderToTime4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut) { __m128 *localBuffer=(__m128 *)buffer; __m128 *localTimeOut=(__m128 *)TimeOut; // Copy the data into the real outputs for(size_t i = 0; i < hFFT->Points; i++) { int brValue; brValue = hFFT->BitReversed[i]; localTimeOut[i*2 ] = localBuffer[brValue ]; localTimeOut[i*2+1] = localBuffer[brValue+1]; } } #endif #define REAL_SINCOSTABLE_VBR16 #ifdef REAL_SINCOSTABLE_VBR16 /* * Forward FFT routine. Must call GetFFT(fftlen) first! * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ] * Imag_i = buffer[ h->BitReversed[i]+1 ] ) * Input is in normal order. * * Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void RealFFTf1xSinCosTableVBR16(fft_type *buffer, FFTParam *h) { fft_type *A,*B; fft_type *endptr1,*endptr2; int br1Index, br2Index; int br1Value, br2Value; fft_type HRplus,HRminus,HIplus,HIminus; fft_type v1,v2,sin,cos; auto ButterfliesPerGroup = h->Points / 2; int pow2BitsMinus1 = h->pow2Bits - 1; int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1); endptr1 = buffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = buffer; B = buffer + ButterfliesPerGroup * 2; int iSinCosIndex = 0; while(A < endptr1) { int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<>= 1; } /* Massage output to get the output for a real input sequence. */ br1Index = 1; br2Index = h->Points - 1; while(br1Index < br2Index) { br1Value=(*SmallVRB[h->pow2Bits])(br1Index); br2Value=(*SmallVRB[h->pow2Bits])(br2Index); int sinCosIndex=br1Index<pow2Bits])(br1Index)+1]; /* Handle DC bin separately - and ignore the Fs/2 bin buffer[0]+=buffer[1]; buffer[1]=(fft_type)0;*/ /* Handle DC and Fs/2 bins separately */ /* Put the Fs/2 value into the imaginary part of the DC bin */ v1=buffer[0]-buffer[1]; buffer[0]+=buffer[1]; buffer[1]=v1; } /* Description: This routine performs an inverse FFT to real data. * This code is for floating point data. * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) * Input is in normal order, interleaved (real,imaginary) complex data * You must call GetFFT(fftlen) first to initialize some buffers! * * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void InverseRealFFTf1xSinCosTableVBR16(fft_type *buffer, FFTParam *h) { fft_type *A,*B; fft_type *endptr1,*endptr2; int br1Index, br1Value; int *br1; fft_type HRplus,HRminus,HIplus,HIminus; fft_type v1,v2,sin,cos; auto ButterfliesPerGroup = h->Points / 2; int pow2BitsMinus1 = h->pow2Bits - 1; int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1); /* Massage input to get the input for a real output sequence. */ A = buffer + 2; B = buffer + h->Points * 2 - 2; br1 = h->BitReversed.get() + 1; br1Index = 1; //h->BitReversed + 1; while(A < B) { br1Value = (*SmallVRB[h->pow2Bits])(br1Index); int sinCosIndex = br1Index << sinCosShift; sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin; cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos; HRplus = (HRminus = *A - *B ) + (*B * 2); HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); v1 = (sin*HRminus + cos*HIplus); v2 = (cos*HRminus - sin*HIplus); *A = (HRplus + v1) * (fft_type)0.5; *B = *A - v1; *(A+1) = (HIminus - v2) * (fft_type)0.5; *(B+1) = *(A+1) - HIminus; A=&A[2]; B=&B[-2]; br1Index++; } /* Handle center bin (just need conjugate) */ *(A+1)=-*(A+1); /* Handle DC bin separately - this ignores any Fs/2 component buffer[1]=buffer[0]=buffer[0]/2;*/ /* Handle DC and Fs/2 bins specially */ /* The DC bin is passed in as the real part of the DC complex value */ /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ /* (v1+v2) = buffer[0] == the DC component */ /* (v1-v2) = buffer[1] == the Fs/2 component */ v1=0.5f*(buffer[0]+buffer[1]); v2=0.5f*(buffer[0]-buffer[1]); buffer[0]=v1; buffer[1]=v2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = buffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = buffer; B = buffer + ButterfliesPerGroup * 2; int iSinCosIndex = 0; while(A < endptr1) { int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift; sin = sSinCosTable.mSinCosTable[sinCosLookup].mSin; cos = sSinCosTable.mSinCosTable[sinCosLookup].mCos; iSinCosIndex++; endptr2 = B; while(A < endptr2) { v1 = *B * cos - *(B + 1) * sin; v2 = *B * sin + *(B + 1) * cos; *B = (*A + v1) * (fft_type)0.5; *(A++) = *(B++) - v1; *B = (*A + v2) * (fft_type)0.5; *(A++) = *(B++) - v2; } A = B; B += ButterfliesPerGroup * 2; } ButterfliesPerGroup >>= 1; } } void ReorderToTime1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut) { // Copy the data into the real outputs for(size_t i = 0;i < hFFT->Points; i++) { int brValue; brValue=(*SmallVRB[hFFT->pow2Bits])(i); TimeOut[i*2 ] = buffer[brValue ]; TimeOut[i*2+1] = buffer[brValue+1]; } } void ReorderToFreq1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) { // Copy the data into the real and imaginary outputs for(size_t i = 1; i < hFFT->Points; i++) { int brValue; brValue = (*SmallVRB[hFFT->pow2Bits])(i); RealOut[i] = buffer[brValue ]; ImagOut[i] = buffer[brValue+1]; } RealOut[0] = buffer[0]; // DC component ImagOut[0] = 0; RealOut[hFFT->Points] = buffer[1]; // Fs/2 component ImagOut[hFFT->Points] = 0; } // 4x processing simd void RealFFTf4xSinCosTableVBR16(fft_type *buffer, FFTParam *h) { __m128 *localBuffer=(__m128 *)buffer; __m128 *A,*B; __m128 *endptr1,*endptr2; int br1Index, br2Index; int br1Value, br2Value; __m128 HRplus,HRminus,HIplus,HIminus; __m128 v1,v2,sin,cos; auto ButterfliesPerGroup = h->Points / 2; int pow2BitsMinus1 = h->pow2Bits - 1; int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1); /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = &localBuffer[h->Points * 2]; while(ButterfliesPerGroup > 0) { A = localBuffer; B = &localBuffer[ButterfliesPerGroup * 2]; int iSinCosIndex = 0; while(A < endptr1) { int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift; sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin); cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos); iSinCosIndex++; endptr2 = B; while(A < endptr2) { v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin)); v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos)); *B = _mm_add_ps( *A, v1); __m128 temp128 = _mm_set1_ps( 2.0); *(A++) = _mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1)); *B = _mm_sub_ps(*A,v2); *(A++) = _mm_add_ps(*(B++), _mm_mul_ps(temp128, v2)); } A = B; B = &B[ButterfliesPerGroup * 2]; } ButterfliesPerGroup >>= 1; } /* Massage output to get the output for a real input sequence. */ br1Index = 1; // h->BitReversed + 1; br2Index = h->Points - 1; //h->BitReversed + h->Points - 1; while(br1Index < br2Index) { br1Value=(*SmallVRB[h->pow2Bits])(br1Index); br2Value=(*SmallVRB[h->pow2Bits])(br2Index); int sinCosIndex=br1Index<pow2Bits])(br1Index)+1]; // negate sse style *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f)); /* Handle DC and Fs/2 bins separately */ /* Put the Fs/2 value into the imaginary part of the DC bin */ v1=_mm_sub_ps(localBuffer[0], localBuffer[1]); localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]); localBuffer[1]=v1; } /* Description: This routine performs an inverse FFT to real data. * This code is for floating point data. * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) * Input is in normal order, interleaved (real,imaginary) complex data * You must call GetFFT(fftlen) first to initialize some buffers! * * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void InverseRealFFTf4xSinCosTableVBR16(fft_type *buffer, FFTParam *h) { __m128 *localBuffer=(__m128 *)buffer; __m128 *A, *B; __m128 *endptr1, *endptr2; int br1Index, br1Value; __m128 HRplus, HRminus, HIplus, HIminus; __m128 v1, v2, sin, cos; auto ButterfliesPerGroup = h->Points / 2; int pow2BitsMinus1 = h->pow2Bits - 1; int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1); /* Massage input to get the input for a real output sequence. */ A = localBuffer + 2; B = localBuffer + h->Points * 2 - 2; br1Index = 1; //h->BitReversed + 1; while(A < B) { br1Value = (*SmallVRB[h->pow2Bits])(br1Index); int sinCosIndex = br1Index << sinCosShift; sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin); cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos); HRminus = _mm_sub_ps(*A, *B); HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0))); HIminus = _mm_sub_ps( *(A+1), *(B+1)); HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0))); v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus)); v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus)); *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5)); *B = _mm_sub_ps(*A, v1); *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5)); *(B+1) = _mm_sub_ps(*(A+1), HIminus); A = &A[2]; B = &B[-2]; br1Index++; } /* Handle center bin (just need conjugate) */ // negate sse style *(A+1) = _mm_xor_ps(*(A+1), _mm_set1_ps(-0.f)); /* Handle DC bin separately - this ignores any Fs/2 component buffer[1]=buffer[0]=buffer[0]/2;*/ /* Handle DC and Fs/2 bins specially */ /* The DC bin is passed in as the real part of the DC complex value */ /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ /* (v1+v2) = buffer[0] == the DC component */ /* (v1-v2) = buffer[1] == the Fs/2 component */ v1 = _mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1])); v2 = _mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1])); localBuffer[0] = v1; localBuffer[1] = v2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = localBuffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = localBuffer; B = localBuffer + ButterfliesPerGroup * 2; int iSinCosIndex = 0; while(A < endptr1) { int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift; sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin); cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos); iSinCosIndex++; endptr2 = B; while(A < endptr2) { v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin)); v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos)); *B = _mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5)); *(A++) = _mm_sub_ps(*(B++), v1); *B = _mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5)); *(A++) = _mm_sub_ps(*(B++),v2); } A = B; B = &B[ButterfliesPerGroup * 2]; } ButterfliesPerGroup >>= 1; } } void ReorderToTime4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut) { __m128 *localBuffer = (__m128 *)buffer; __m128 *localTimeOut = (__m128 *)TimeOut; // Copy the data into the real outputs for(size_t i = 0; i < hFFT->Points; i++) { int brValue; brValue = (*SmallVRB[hFFT->pow2Bits])(i); localTimeOut[i*2 ] = localBuffer[brValue ]; localTimeOut[i*2+1] = localBuffer[brValue+1]; } } void ReorderToFreq4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) { __m128 *localBuffer = (__m128 *)buffer; __m128 *localRealOut = (__m128 *)RealOut; __m128 *localImagOut = (__m128 *)ImagOut; // Copy the data into the real and imaginary outputs for(size_t i = 1; i < hFFT->Points; i++) { int brValue; brValue = (*SmallVRB[hFFT->pow2Bits])(i); localRealOut[i] = localBuffer[brValue ]; localImagOut[i] = localBuffer[brValue+1]; } localRealOut[0] = localBuffer[0]; // DC component localImagOut[0] = _mm_set1_ps(0.0); localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component localImagOut[hFFT->Points] = _mm_set1_ps(0.0); } #endif #define REAL_SINCOSTABLE_BR16 #ifdef REAL_SINCOSTABLE_BR16 /* * Forward FFT routine. Must call GetFFT(fftlen) first! * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ] * Imag_i = buffer[ h->BitReversed[i]+1 ] ) * Input is in normal order. * * Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void RealFFTf1xSinCosTableBR16(fft_type *buffer, FFTParam *h) { fft_type *A,*B; fft_type *endptr1, *endptr2; int br1Index, br2Index; int br1Value, br2Value; fft_type HRplus, HRminus, HIplus, HIminus; fft_type v1, v2, sin, cos; auto ButterfliesPerGroup = h->Points / 2; int pow2BitsMinus1 = h->pow2Bits - 1; int bitReverseShiftM1 = 17 - h->pow2Bits; int bitReverseShift = bitReverseShiftM1 - 1; int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1); endptr1 = buffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = buffer; B = buffer + ButterfliesPerGroup * 2; int iSinCosIndex = 0; while(A < endptr1) { // int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<>bitReverseShiftM1)<>= 1; } /* Massage output to get the output for a real input sequence. */ br1Index = 1; br2Index = h->Points - 1; while(br1Index < br2Index) { br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index); br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index); int sinCosIndex = br1Index << sinCosShift; sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin; cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos; A = &buffer[br1Value]; B = &buffer[br2Value]; HRplus = (HRminus = *A - *B ) + (*B * 2); HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); v1 = (sin*HRminus - cos*HIplus); v2 = (cos*HRminus + sin*HIplus); *A = (HRplus + v1) * (fft_type)0.5; *B = *A - v1; *(A+1) = (HIminus + v2) * (fft_type)0.5; *(B+1) = *(A+1) - HIminus; br1Index++; br2Index--; } /* Handle the center bin (just need a conjugate) */ // A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1]; A=&buffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1]; /* Handle DC bin separately - and ignore the Fs/2 bin buffer[0]+=buffer[1]; buffer[1]=(fft_type)0;*/ /* Handle DC and Fs/2 bins separately */ /* Put the Fs/2 value into the imaginary part of the DC bin */ v1=buffer[0]-buffer[1]; buffer[0]+=buffer[1]; buffer[1]=v1; } /* Description: This routine performs an inverse FFT to real data. * This code is for floating point data. * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) * Input is in normal order, interleaved (real,imaginary) complex data * You must call GetFFT(fftlen) first to initialize some buffers! * * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void InverseRealFFTf1xSinCosTableBR16(fft_type *buffer, FFTParam *h) { fft_type *A,*B; fft_type *endptr1,*endptr2; int br1Index; fft_type HRplus, HRminus, HIplus, HIminus; fft_type v1, v2, sin, cos; auto ButterfliesPerGroup = h->Points / 2; int pow2BitsMinus1 = h->pow2Bits - 1; int sinCosShift = (sSinCosTable.mSinCosTablePow-pow2BitsMinus1); int bitReverseShiftM1 = 17 - h->pow2Bits; /* Massage input to get the input for a real output sequence. */ A = buffer + 2; B = buffer + h->Points * 2 - 2; br1Index = 1; //h->BitReversed + 1; while(A < B) { int sinCosIndex = br1Index << sinCosShift; sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin; cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos; HRplus = (HRminus = *A - *B ) + (*B * 2); HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); v1 = (sin*HRminus + cos*HIplus); v2 = (cos*HRminus - sin*HIplus); *A = (HRplus + v1) * (fft_type)0.5; *B = *A - v1; *(A+1) = (HIminus - v2) * (fft_type)0.5; *(B+1) = *(A+1) - HIminus; A=&A[2]; B=&B[-2]; br1Index++; } /* Handle center bin (just need conjugate) */ *(A+1)=-*(A+1); /* Handle DC bin separately - this ignores any Fs/2 component buffer[1]=buffer[0]=buffer[0]/2;*/ /* Handle DC and Fs/2 bins specially */ /* The DC bin is passed in as the real part of the DC complex value */ /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ /* (v1+v2) = buffer[0] == the DC component */ /* (v1-v2) = buffer[1] == the Fs/2 component */ v1=0.5f*(buffer[0]+buffer[1]); v2=0.5f*(buffer[0]-buffer[1]); buffer[0]=v1; buffer[1]=v2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = buffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = buffer; B = buffer + ButterfliesPerGroup * 2; int iSinCosIndex = 0; while(A < endptr1) { // int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<>bitReverseShiftM1)<>= 1; } } void ReorderToFreq1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) { int bitReverseShift=16-hFFT->pow2Bits; // Copy the data into the real and imaginary outputs for(size_t i = 1; i < hFFT->Points; i++) { int brValue; // brValue=(*SmallVRB[hFFT->pow2Bits])(i); brValue = ( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift); RealOut[i] = buffer[brValue ]; ImagOut[i] = buffer[brValue+1]; } RealOut[0] = buffer[0]; // DC component ImagOut[0] = 0; RealOut[hFFT->Points] = buffer[1]; // Fs/2 component ImagOut[hFFT->Points] = 0; } void ReorderToTime1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut) { int bitReverseShift=16-hFFT->pow2Bits; // Copy the data into the real outputs for(size_t i = 0; i < hFFT->Points; i++) { int brValue; // brValue=(*SmallVRB[hFFT->pow2Bits])(i); brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift); TimeOut[i*2 ] = buffer[brValue ]; TimeOut[i*2+1] = buffer[brValue+1]; } } // 4x processing simd void RealFFTf4xSinCosTableBR16(fft_type *buffer, FFTParam *h) { __m128 *localBuffer=(__m128 *)buffer; __m128 *A, *B; __m128 *endptr1, *endptr2; int br1Index, br2Index; int br1Value, br2Value; __m128 HRplus, HRminus, HIplus, HIminus; __m128 v1, v2, sin, cos; auto ButterfliesPerGroup = h->Points / 2; int pow2BitsMinus1 = h->pow2Bits - 1; int sinCosShift = (sSinCosTable.mSinCosTablePow-pow2BitsMinus1); int bitReverseShiftM1 = 17 - h->pow2Bits; int bitReverseShift = bitReverseShiftM1 - 1; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = &localBuffer[h->Points * 2]; while(ButterfliesPerGroup > 0) { A = localBuffer; B = &localBuffer[ButterfliesPerGroup * 2]; int iSinCosIndex = 0; while(A < endptr1) { // int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<>bitReverseShiftM1)<>= 1; } /* Massage output to get the output for a real input sequence. */ br1Index = 1; // h->BitReversed + 1; br2Index = h->Points - 1; //h->BitReversed + h->Points - 1; while(br1Index < br2Index) { br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index); br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index); int sinCosIndex=br1Index<pow2Bits])(br1Index)+1]; A=&localBuffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1]; // negate sse style *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f)); /* Handle DC and Fs/2 bins separately */ /* Put the Fs/2 value into the imaginary part of the DC bin */ v1=_mm_sub_ps(localBuffer[0], localBuffer[1]); localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]); localBuffer[1]=v1; } /* Description: This routine performs an inverse FFT to real data. * This code is for floating point data. * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) * Input is in normal order, interleaved (real,imaginary) complex data * You must call GetFFT(fftlen) first to initialize some buffers! * * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void InverseRealFFTf4xSinCosTableBR16(fft_type *buffer, FFTParam *h) { __m128 *localBuffer=(__m128 *)buffer; __m128 *A, *B; __m128 *endptr1, *endptr2; int br1Index; __m128 HRplus, HRminus, HIplus, HIminus; __m128 v1, v2, sin, cos; auto ButterfliesPerGroup = h->Points / 2; int pow2BitsMinus1 = h->pow2Bits - 1; int sinCosShift = (sSinCosTable.mSinCosTablePow-pow2BitsMinus1); int bitReverseShiftM1 = 17 - h->pow2Bits; /* Massage input to get the input for a real output sequence. */ A = localBuffer + 2; B = localBuffer + h->Points * 2 - 2; br1Index = 1; //h->BitReversed + 1; while(A < B) { int sinCosIndex = br1Index << sinCosShift; sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin); cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos); HRminus = _mm_sub_ps(*A, *B); HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0))); HIminus = _mm_sub_ps( *(A+1), *(B+1)); HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0))); v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus)); v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus)); *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5)); *B = _mm_sub_ps(*A, v1); *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5)); *(B+1) = _mm_sub_ps(*(A+1), HIminus); A=&A[2]; B=&B[-2]; br1Index++; } /* Handle center bin (just need conjugate) */ // negate sse style *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f)); /* Handle DC bin separately - this ignores any Fs/2 component buffer[1]=buffer[0]=buffer[0]/2;*/ /* Handle DC and Fs/2 bins specially */ /* The DC bin is passed in as the real part of the DC complex value */ /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ /* (v1+v2) = buffer[0] == the DC component */ /* (v1-v2) = buffer[1] == the Fs/2 component */ v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1])); v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1])); localBuffer[0] = v1; localBuffer[1] = v2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = localBuffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = localBuffer; B = localBuffer + ButterfliesPerGroup * 2; int iSinCosIndex = 0; while(A < endptr1) { // int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<>bitReverseShiftM1)<>= 1; } } void ReorderToTime4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut) { __m128 *localBuffer = (__m128 *)buffer; __m128 *localTimeOut = (__m128 *)TimeOut; int bitReverseShift = 16 - hFFT->pow2Bits; // Copy the data into the real outputs for(size_t i = 0; i < hFFT->Points; i++) { int brValue; brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift); // brValue=(*SmallVRB[hFFT->pow2Bits])(i); localTimeOut[i*2 ] = localBuffer[brValue ]; localTimeOut[i*2+1] = localBuffer[brValue+1]; } } void ReorderToFreq4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) { __m128 *localBuffer = (__m128 *)buffer; __m128 *localRealOut = (__m128 *)RealOut; __m128 *localImagOut = (__m128 *)ImagOut; int bitReverseShift = 16 - hFFT->pow2Bits; // Copy the data into the real and imaginary outputs for(size_t i = 1; i < hFFT->Points; i++) { int brValue; brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift); // brValue=(*SmallVRB[hFFT->pow2Bits])(i); localRealOut[i] = localBuffer[brValue ]; localImagOut[i] = localBuffer[brValue+1]; } localRealOut[0] = localBuffer[0]; // DC component localImagOut[0] = _mm_set1_ps(0.0); localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component localImagOut[hFFT->Points] = _mm_set1_ps(0.0); } #endif #define FAST_MATH_BR24 #ifdef FAST_MATH_BR24 /* * Forward FFT routine. Must call GetFFT(fftlen) first! * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ] * Imag_i = buffer[ h->BitReversed[i]+1 ] ) * Input is in normal order. * * Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void RealFFTf1xFastMathBR24(fft_type *buffer, FFTParam *h) { fft_type *A, *B; fft_type *endptr1, *endptr2; int br1Index, br2Index; int br1Value, br2Value; fft_type HRplus, HRminus, HIplus, HIminus; fft_type v1, v2, sin, cos; fft_type iToRad = 2 * M_PI/(2 * h->Points); int bitReverseShift = 24 - h->pow2Bits; int bitReverseShiftM1 = bitReverseShift + 1; auto ButterfliesPerGroup = h->Points / 2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = buffer + h->Points * 2; const v4sf zeroes = {0.0,0.0,0.0,0.0}; while(ButterfliesPerGroup > 0) { A = buffer; B = buffer + ButterfliesPerGroup * 2; int sinCosCalIndex = 0; int iSinCosIndex = 0; while(A < endptr1) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { //v4sf vx=zeroes; // <-- If we want to suppress the C4701 warning later. v4sf vx; for(int i=0;i<4;i++) { int brTemp=iSinCosIndex+i; vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad; // vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad; } //"Warning C4701 potentially uninitialized local variable 'vx' " is OK. //vx is initialised component by component, and MSVC doesn't realize. sincos_ps(vx, &sin4_2, &cos4_2); sin=-sin4_2.m128_f32[0]; cos=-cos4_2.m128_f32[0]; sinCosCalIndex++; } else { sin=-sin4_2.m128_f32[sinCosCalIndex]; cos=-cos4_2.m128_f32[sinCosCalIndex]; if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } iSinCosIndex++; endptr2=B; while(A>= 1; } /* Massage output to get the output for a real input sequence. */ br1Index = 1; // h->BitReversed + 1; br2Index = h->Points - 1; //h->BitReversed + h->Points - 1; int sinCosCalIndex = 0; while(br1Index < br2Index) { v4sf sin4_2, cos4_2; br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index); br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br2Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index); if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) vx.m128_f32[i]=((float)(br1Index+i))*iToRad; sincos_ps(vx, &sin4_2, &cos4_2); sin=-sin4_2.m128_f32[0]; cos=-cos4_2.m128_f32[0]; sinCosCalIndex++; } else { sin=-sin4_2.m128_f32[sinCosCalIndex]; cos=-cos4_2.m128_f32[sinCosCalIndex]; if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } A=&buffer[br1Value]; B=&buffer[br2Value]; HRplus = (HRminus = *A - *B ) + (*B * 2); HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); v1 = (sin*HRminus - cos*HIplus); v2 = (cos*HRminus + sin*HIplus); *A = (HRplus + v1) * (fft_type)0.5; *B = *A - v1; *(A+1) = (HIminus + v2) * (fft_type)0.5; *(B+1) = *(A+1) - HIminus; br1Index++; br2Index--; } /* Handle the center bin (just need a conjugate) */ // A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1]; A=&buffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift)+1]; /* Handle DC bin separately - and ignore the Fs/2 bin buffer[0]+=buffer[1]; buffer[1]=(fft_type)0;*/ /* Handle DC and Fs/2 bins separately */ /* Put the Fs/2 value into the imaginary part of the DC bin */ v1=buffer[0]-buffer[1]; buffer[0]+=buffer[1]; buffer[1]=v1; } /* Description: This routine performs an inverse FFT to real data. * This code is for floating point data. * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) * Input is in normal order, interleaved (real,imaginary) complex data * You must call GetFFT(fftlen) first to initialize some buffers! * * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void InverseRealFFTf1xFastMathBR24(fft_type *buffer, FFTParam *h) { fft_type *A,*B; fft_type *endptr1,*endptr2; int br1Index; fft_type HRplus, HRminus, HIplus, HIminus; fft_type v1, v2, sin, cos; fft_type iToRad = 2 * M_PI / (2 * h->Points); int bitReverseShiftM1 = 25 - h->pow2Bits; auto ButterfliesPerGroup = h->Points / 2; /* Massage input to get the input for a real output sequence. */ A = buffer + 2; B = buffer + h->Points * 2 - 2; br1Index = 1; //h->BitReversed + 1; int sinCosCalIndex = 0; while(A < B) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) vx.m128_f32[i]=((float)(br1Index+i))*iToRad; sincos_ps(vx, &sin4_2, &cos4_2); sin=-sin4_2.m128_f32[0]; cos=-cos4_2.m128_f32[0]; sinCosCalIndex++; } else { sin=-sin4_2.m128_f32[sinCosCalIndex]; cos=-cos4_2.m128_f32[sinCosCalIndex]; if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } HRplus = (HRminus = *A - *B ) + (*B * 2); HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); v1 = (sin*HRminus + cos*HIplus); v2 = (cos*HRminus - sin*HIplus); *A = (HRplus + v1) * (fft_type)0.5; *B = *A - v1; *(A+1) = (HIminus - v2) * (fft_type)0.5; *(B+1) = *(A+1) - HIminus; A=&A[2]; B=&B[-2]; br1Index++; } /* Handle center bin (just need conjugate) */ *(A+1)=-*(A+1); /* Handle DC bin separately - this ignores any Fs/2 component buffer[1]=buffer[0]=buffer[0]/2;*/ /* Handle DC and Fs/2 bins specially */ /* The DC bin is passed in as the real part of the DC complex value */ /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ /* (v1+v2) = buffer[0] == the DC component */ /* (v1-v2) = buffer[1] == the Fs/2 component */ v1=0.5f*(buffer[0]+buffer[1]); v2=0.5f*(buffer[0]-buffer[1]); buffer[0]=v1; buffer[1]=v2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = buffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = buffer; B = buffer + ButterfliesPerGroup * 2; int sinCosCalIndex = 0; int iSinCosIndex = 0; while(A < endptr1) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) { int brTemp=iSinCosIndex+i; vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad; // vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad; } sincos_ps(vx, &sin4_2, &cos4_2); sin=-sin4_2.m128_f32[0]; cos=-cos4_2.m128_f32[0]; sinCosCalIndex++; } else { sin=-sin4_2.m128_f32[sinCosCalIndex]; cos=-cos4_2.m128_f32[sinCosCalIndex]; if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } iSinCosIndex++; endptr2=B; while(A>= 1; } } void ReorderToFreq1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) { int bitReverseShift = 24 - hFFT->pow2Bits; // Copy the data into the real and imaginary outputs for(size_t i = 1; i < hFFT->Points; i++) { int brValue; // brValue=(*SmallVRB[hFFT->pow2Bits])(i); brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift); RealOut[i] = buffer[brValue ]; ImagOut[i] = buffer[brValue+1]; } RealOut[0] = buffer[0]; // DC component ImagOut[0] = 0; RealOut[hFFT->Points] = buffer[1]; // Fs/2 component ImagOut[hFFT->Points] = 0; } void ReorderToTime1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut) { int bitReverseShift = 24 - hFFT->pow2Bits; // Copy the data into the real outputs for(size_t i = 0; i < hFFT->Points; i++) { int brValue; brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift); // brValue=(*SmallVRB[hFFT->pow2Bits])(i); TimeOut[i*2 ] = buffer[brValue ]; TimeOut[i*2+1] = buffer[brValue+1]; } } void RealFFTf4xFastMathBR24(fft_type *buffer, FFTParam *h) { __m128 *localBuffer=(__m128 *)buffer; __m128 *A,*B; __m128 *endptr1,*endptr2; int br1Index, br2Index; int br1Value, br2Value; __m128 HRplus,HRminus,HIplus,HIminus; __m128 v1,v2,sin,cos; fft_type iToRad = 2 * M_PI/(2 * h->Points); auto ButterfliesPerGroup = h->Points / 2; int bitReverseShift = 24 - h->pow2Bits; int bitReverseShiftM1 = bitReverseShift + 1; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = &localBuffer[h->Points * 2]; while(ButterfliesPerGroup > 0) { A = localBuffer; B = &localBuffer[ButterfliesPerGroup * 2]; int sinCosCalIndex = 0; int iSinCosIndex = 0; while(A < endptr1) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) { int brTemp=iSinCosIndex+i; vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad; // vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad; } sincos_ps(vx, &sin4_2, &cos4_2); sin=_mm_set1_ps(-sin4_2.m128_f32[0]); cos=_mm_set1_ps(-cos4_2.m128_f32[0]); sinCosCalIndex++; } else { sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]); cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]); if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } iSinCosIndex++; endptr2=B; while(A>= 1; } /* Massage output to get the output for a real input sequence. */ br1Index = 1; // h->BitReversed+1; br2Index = h->Points - 1; //h->BitReversed + h->Points - 1; int sinCosCalIndex = 0; while(br1Index < br2Index) { v4sf sin4_2, cos4_2; br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index); br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br2Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index); if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) vx.m128_f32[i]=((float)(br1Index+i))*iToRad; sincos_ps(vx, &sin4_2, &cos4_2); sin=_mm_set1_ps(-sin4_2.m128_f32[0]); cos=_mm_set1_ps(-cos4_2.m128_f32[0]); sinCosCalIndex++; } else { sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]); cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]); if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } A=&localBuffer[br1Value]; B=&localBuffer[br2Value]; __m128 temp128 = _mm_set1_ps( 2.0); HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128)); HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128)); v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus)); v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus)); temp128 = _mm_set1_ps( 0.5); *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128); *B = _mm_sub_ps(*A, v1); *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128); *(B+1) = _mm_sub_ps(*(A+1), HIminus); br1Index++; br2Index--; } /* Handle the center bin (just need a conjugate) */ // A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1]; A=&localBuffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift)+1]; // negate sse style *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f)); /* Handle DC and Fs/2 bins separately */ /* Put the Fs/2 value into the imaginary part of the DC bin */ v1=_mm_sub_ps(localBuffer[0], localBuffer[1]); localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]); localBuffer[1]=v1; } /* Description: This routine performs an inverse FFT to real data. * This code is for floating point data. * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) * Input is in normal order, interleaved (real,imaginary) complex data * You must call GetFFT(fftlen) first to initialize some buffers! * * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void InverseRealFFTf4xFastMathBR24(fft_type *buffer, FFTParam *h) { __m128 *localBuffer=(__m128 *)buffer; __m128 *A,*B; __m128 *endptr1,*endptr2; int br1Index; __m128 HRplus,HRminus,HIplus,HIminus; __m128 v1,v2,sin,cos; fft_type iToRad = 2 * M_PI/(2 * h->Points); int bitReverseShiftM1 = 25 - h->pow2Bits; auto ButterfliesPerGroup = h->Points / 2; /* Massage input to get the input for a real output sequence. */ A = localBuffer + 2; B = localBuffer + h->Points * 2 - 2; br1Index = 1; //h->BitReversed + 1; int sinCosCalIndex = 0; while(A < B) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) vx.m128_f32[i]=((float)(br1Index+i))*iToRad; sincos_ps(vx, &sin4_2, &cos4_2); sin=_mm_set1_ps(-sin4_2.m128_f32[0]); cos=_mm_set1_ps(-cos4_2.m128_f32[0]); sinCosCalIndex++; } else { sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]); cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]); if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } HRminus = _mm_sub_ps(*A, *B); HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0))); HIminus = _mm_sub_ps( *(A+1), *(B+1)); HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0))); v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus)); v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus)); *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5)); *B = _mm_sub_ps(*A, v1); *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5)); *(B+1) = _mm_sub_ps(*(A+1), HIminus); A=&A[2]; B=&B[-2]; br1Index++; } /* Handle center bin (just need conjugate) */ // negate sse style *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f)); /* Handle DC bin separately - this ignores any Fs/2 component buffer[1]=buffer[0]=buffer[0]/2;*/ /* Handle DC and Fs/2 bins specially */ /* The DC bin is passed in as the real part of the DC complex value */ /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ /* (v1+v2) = buffer[0] == the DC component */ /* (v1-v2) = buffer[1] == the Fs/2 component */ v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1])); v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1])); localBuffer[0]=v1; localBuffer[1]=v2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = localBuffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = localBuffer; B = localBuffer + ButterfliesPerGroup * 2; int sinCosCalIndex = 0; int iSinCosIndex = 0; while(A < endptr1) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) { int brTemp=iSinCosIndex+i; vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad; // vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad; } sincos_ps(vx, &sin4_2, &cos4_2); sin=_mm_set1_ps(-sin4_2.m128_f32[0]); cos=_mm_set1_ps(-cos4_2.m128_f32[0]); sinCosCalIndex++; } else { sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]); cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]); if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } iSinCosIndex++; endptr2=B; while(A>= 1; } } void ReorderToFreq4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) { __m128 *localBuffer = (__m128 *)buffer; __m128 *localRealOut = (__m128 *)RealOut; __m128 *localImagOut = (__m128 *)ImagOut; int bitReverseShift = 24-hFFT->pow2Bits; // Copy the data into the real and imaginary outputs for(size_t i = 1; i < hFFT->Points; i++) { int brValue; brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift); // brValue=(*SmallVRB[hFFT->pow2Bits])(i); localRealOut[i]=localBuffer[brValue ]; localImagOut[i]=localBuffer[brValue+1]; } localRealOut[0] = localBuffer[0]; // DC component localImagOut[0] = _mm_set1_ps(0.0); localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component localImagOut[hFFT->Points] = _mm_set1_ps(0.0); } void ReorderToTime4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut) { __m128 *localBuffer = (__m128 *)buffer; __m128 *localTimeOut = (__m128 *)TimeOut; int bitReverseShift = 24-hFFT->pow2Bits; // Copy the data into the real outputs for(size_t i = 0; i < hFFT->Points; i++) { int brValue; brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift); // brValue=(*SmallVRB[hFFT->pow2Bits])(i); localTimeOut[i*2 ] = localBuffer[brValue ]; localTimeOut[i*2+1] = localBuffer[brValue+1]; } } #endif #define FAST_MATH_BR16 #ifdef FAST_MATH_BR16 /* * Forward FFT routine. Must call GetFFT(fftlen) first! * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ] * Imag_i = buffer[ h->BitReversed[i]+1 ] ) * Input is in normal order. * * Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void RealFFTf1xFastMathBR16(fft_type *buffer, FFTParam *h) { fft_type *A,*B; fft_type *endptr1,*endptr2; int br1Index, br2Index; int br1Value, br2Value; fft_type HRplus,HRminus,HIplus,HIminus; fft_type v1,v2,sin,cos; fft_type iToRad = 2 * M_PI / (2 * h->Points); int bitReverseShiftM1 = 17 - h->pow2Bits; int bitReverseShift = bitReverseShiftM1 - 1; auto ButterfliesPerGroup = h->Points / 2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = buffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = buffer; B = buffer + ButterfliesPerGroup * 2; int sinCosCalIndex = 0; int iSinCosIndex = 0; while(A < endptr1) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) { int brTemp=iSinCosIndex+i; vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad; // vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad; } sincos_ps(vx, &sin4_2, &cos4_2); sin=-sin4_2.m128_f32[0]; cos=-cos4_2.m128_f32[0]; sinCosCalIndex++; } else { sin=-sin4_2.m128_f32[sinCosCalIndex]; cos=-cos4_2.m128_f32[sinCosCalIndex]; if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } iSinCosIndex++; endptr2=B; while(A>= 1; } /* Massage output to get the output for a real input sequence. */ br1Index = 1; // h->BitReversed+1; br2Index = h->Points - 1; //h->BitReversed + h->Points - 1; int sinCosCalIndex = 0; while(br1Index < br2Index) { v4sf sin4_2, cos4_2; br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index); br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index); if(!sinCosCalIndex) { v4sf vx; for(int i = 0; i < 4; i++) vx.m128_f32[i]=((float)(br1Index+i))*iToRad; sincos_ps(vx, &sin4_2, &cos4_2); sin = -sin4_2.m128_f32[0]; cos=-cos4_2.m128_f32[0]; sinCosCalIndex++; } else { sin=-sin4_2.m128_f32[sinCosCalIndex]; cos=-cos4_2.m128_f32[sinCosCalIndex]; if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } A=&buffer[br1Value]; B=&buffer[br2Value]; HRplus = (HRminus = *A - *B ) + (*B * 2); HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); v1 = (sin*HRminus - cos*HIplus); v2 = (cos*HRminus + sin*HIplus); *A = (HRplus + v1) * (fft_type)0.5; *B = *A - v1; *(A+1) = (HIminus + v2) * (fft_type)0.5; *(B+1) = *(A+1) - HIminus; br1Index++; br2Index--; } /* Handle the center bin (just need a conjugate) */ // A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1]; A=&buffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1]; /* Handle DC bin separately - and ignore the Fs/2 bin buffer[0]+=buffer[1]; buffer[1]=(fft_type)0;*/ /* Handle DC and Fs/2 bins separately */ /* Put the Fs/2 value into the imaginary part of the DC bin */ v1=buffer[0]-buffer[1]; buffer[0]+=buffer[1]; buffer[1]=v1; } /* Description: This routine performs an inverse FFT to real data. * This code is for floating point data. * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) * Input is in normal order, interleaved (real,imaginary) complex data * You must call GetFFT(fftlen) first to initialize some buffers! * * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void InverseRealFFTf1xFastMathBR16(fft_type *buffer, FFTParam *h) { fft_type *A,*B; fft_type *endptr1,*endptr2; int br1Index; fft_type HRplus,HRminus,HIplus,HIminus; fft_type v1,v2,sin,cos; fft_type iToRad=2 * M_PI / (2 * h->Points); int bitReverseShiftM1=17-h->pow2Bits; auto ButterfliesPerGroup = h->Points / 2; /* Massage input to get the input for a real output sequence. */ A = buffer + 2; B = buffer + h->Points * 2 - 2; br1Index = 1; //h->BitReversed + 1; int sinCosCalIndex = 0; while(A < B) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) vx.m128_f32[i]=((float)(br1Index+i))*iToRad; sincos_ps(vx, &sin4_2, &cos4_2); sin=-sin4_2.m128_f32[0]; cos=-cos4_2.m128_f32[0]; sinCosCalIndex++; } else { sin=-sin4_2.m128_f32[sinCosCalIndex]; cos=-cos4_2.m128_f32[sinCosCalIndex]; if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } HRplus = (HRminus = *A - *B ) + (*B * 2); HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2); v1 = (sin*HRminus + cos*HIplus); v2 = (cos*HRminus - sin*HIplus); *A = (HRplus + v1) * (fft_type)0.5; *B = *A - v1; *(A+1) = (HIminus - v2) * (fft_type)0.5; *(B+1) = *(A+1) - HIminus; A=&A[2]; B=&B[-2]; br1Index++; } /* Handle center bin (just need conjugate) */ *(A+1)=-*(A+1); /* Handle DC bin separately - this ignores any Fs/2 component buffer[1]=buffer[0]=buffer[0]/2;*/ /* Handle DC and Fs/2 bins specially */ /* The DC bin is passed in as the real part of the DC complex value */ /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */ /* (v1+v2) = buffer[0] == the DC component */ /* (v1-v2) = buffer[1] == the Fs/2 component */ v1=0.5f*(buffer[0]+buffer[1]); v2=0.5f*(buffer[0]-buffer[1]); buffer[0]=v1; buffer[1]=v2; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = buffer + h->Points * 2; while(ButterfliesPerGroup > 0) { A = buffer; B = buffer + ButterfliesPerGroup * 2; int sinCosCalIndex = 0; int iSinCosIndex = 0; while(A < endptr1) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { v4sf vx; for(int i = 0; i < 4; i++) { int brTemp = iSinCosIndex + i; vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad; // vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad; } sincos_ps(vx, &sin4_2, &cos4_2); sin=-sin4_2.m128_f32[0]; cos=-cos4_2.m128_f32[0]; sinCosCalIndex++; } else { sin=-sin4_2.m128_f32[sinCosCalIndex]; cos=-cos4_2.m128_f32[sinCosCalIndex]; if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } iSinCosIndex++; endptr2=B; while(A>= 1; } } void ReorderToFreq1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) { int bitReverseShift = 16 - hFFT->pow2Bits; // Copy the data into the real and imaginary outputs for(size_t i = 1; i < hFFT->Points; i++) { int brValue; // brValue=(*SmallVRB[hFFT->pow2Bits])(i); brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift); RealOut[i] = buffer[brValue ]; ImagOut[i] = buffer[brValue+1]; } RealOut[0] = buffer[0]; // DC component ImagOut[0] = 0; RealOut[hFFT->Points] = buffer[1]; // Fs/2 component ImagOut[hFFT->Points] = 0; } void ReorderToTime1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut) { int bitReverseShift=16-hFFT->pow2Bits; // Copy the data into the real outputs for(size_t i = 0; i < hFFT->Points; i++) { int brValue; brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift); // brValue=(*SmallVRB[hFFT->pow2Bits])(i); TimeOut[i*2 ] = buffer[brValue ]; TimeOut[i*2+1] = buffer[brValue+1]; } } void RealFFTf4xFastMathBR16(fft_type *buffer, FFTParam *h) { __m128 *localBuffer = (__m128 *)buffer; __m128 *A,*B; __m128 *endptr1, *endptr2; int br1Index, br2Index; int br1Value, br2Value; __m128 HRplus, HRminus, HIplus, HIminus; __m128 v1, v2, sin, cos; fft_type iToRad = 2 * M_PI/(2 * h->Points); auto ButterfliesPerGroup = h->Points / 2; int bitReverseShiftM1 = 17 - h->pow2Bits; int bitReverseShift = bitReverseShiftM1 - 1; /* * Butterfly: * Ain-----Aout * \ / * / \ * Bin-----Bout */ endptr1 = &localBuffer[h->Points * 2]; while(ButterfliesPerGroup > 0) { A = localBuffer; B = &localBuffer[ButterfliesPerGroup * 2]; int sinCosCalIndex = 0; int iSinCosIndex = 0; while(A < endptr1) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) { int brTemp=iSinCosIndex+i; vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad; // vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad; } sincos_ps(vx, &sin4_2, &cos4_2); sin=_mm_set1_ps(-sin4_2.m128_f32[0]); cos=_mm_set1_ps(-cos4_2.m128_f32[0]); sinCosCalIndex++; } else { sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]); cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]); if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } iSinCosIndex++; endptr2=B; while(A>= 1; } /* Massage output to get the output for a real input sequence. */ br1Index = 1; // h->BitReversed + 1; br2Index = h->Points - 1; //h->BitReversed + h->Points - 1; int sinCosCalIndex = 0; while(br1Index < br2Index) { v4sf sin4_2, cos4_2; br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index); br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index); if(!sinCosCalIndex) { v4sf vx; for(int i = 0; i < 4; i++) vx.m128_f32[i] = ((float)(br1Index+i)) * iToRad; sincos_ps(vx, &sin4_2, &cos4_2); sin = _mm_set1_ps(-sin4_2.m128_f32[0]); cos = _mm_set1_ps(-cos4_2.m128_f32[0]); sinCosCalIndex++; } else { sin = _mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]); cos = _mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]); if(sinCosCalIndex == 3) sinCosCalIndex = 0; else sinCosCalIndex++; } A = &localBuffer[br1Value]; B = &localBuffer[br2Value]; __m128 temp128 = _mm_set1_ps( 2.0); HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128)); HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128)); v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus)); v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus)); temp128 = _mm_set1_ps( 0.5); *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128); *B = _mm_sub_ps(*A, v1); *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128); *(B+1) = _mm_sub_ps(*(A+1), HIminus); br1Index++; br2Index--; } /* Handle the center bin (just need a conjugate) */ // A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1]; A=&localBuffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1]; // negate sse style *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f)); /* Handle DC and Fs/2 bins separately */ /* Put the Fs/2 value into the imaginary part of the DC bin */ v1=_mm_sub_ps(localBuffer[0], localBuffer[1]); localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]); localBuffer[1]=v1; } /* Description: This routine performs an inverse FFT to real data. * This code is for floating point data. * * Note: Output is BIT-REVERSED! so you must use the BitReversed to * get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ] * wave[2*i+1] = buffer[ BitReversed[i]+1 ] ) * Input is in normal order, interleaved (real,imaginary) complex data * You must call GetFFT(fftlen) first to initialize some buffers! * * Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin * - this can be done because both values will always be real only * - this allows us to not have to allocate an extra complex value for the Fs/2 bin * * Note: The scaling on this is done according to the standard FFT definition, * so a unit amplitude DC signal will output an amplitude of (N) * (Older revisions would progressively scale the input, so the output * values would be similar in amplitude to the input values, which is * good when using fixed point arithmetic) */ void InverseRealFFTf4xFastMathBR16(fft_type *buffer, FFTParam *h) { __m128 *localBuffer=(__m128 *)buffer; __m128 *A, *B; __m128 *endptr1, *endptr2; int br1Index; __m128 HRplus, HRminus, HIplus, HIminus; __m128 v1, v2, sin, cos; fft_type iToRad = 2 * M_PI/(2 * h->Points); int bitReverseShiftM1 = 17 - h->pow2Bits; auto ButterfliesPerGroup = h->Points / 2; /* Massage input to get the input for a real output sequence. */ A = localBuffer + 2; B = localBuffer + h->Points * 2 - 2; br1Index=1; //h->BitReversed+1; int sinCosCalIndex=0; while(APoints * 2; while(ButterfliesPerGroup > 0) { A = localBuffer; B = localBuffer + ButterfliesPerGroup * 2; int sinCosCalIndex = 0; int iSinCosIndex = 0; while(A < endptr1) { v4sf sin4_2, cos4_2; if(!sinCosCalIndex) { v4sf vx; for(int i=0;i<4;i++) { int brTemp=iSinCosIndex+i; vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad; // vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad; } sincos_ps(vx, &sin4_2, &cos4_2); sin=_mm_set1_ps(-sin4_2.m128_f32[0]); cos=_mm_set1_ps(-cos4_2.m128_f32[0]); sinCosCalIndex++; } else { sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]); cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]); if(sinCosCalIndex==3) sinCosCalIndex=0; else sinCosCalIndex++; } iSinCosIndex++; endptr2=B; while(A>= 1; } } void ReorderToFreq4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut) { __m128 *localBuffer=(__m128 *)buffer; __m128 *localRealOut=(__m128 *)RealOut; __m128 *localImagOut=(__m128 *)ImagOut; int bitReverseShift=16-hFFT->pow2Bits; // Copy the data into the real and imaginary outputs for(size_t i = 1; i < hFFT->Points; i++) { int brValue; brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift); // brValue=(*SmallVRB[hFFT->pow2Bits])(i); localRealOut[i]=localBuffer[brValue ]; localImagOut[i]=localBuffer[brValue+1]; } localRealOut[0] = localBuffer[0]; // DC component localImagOut[0] = _mm_set1_ps(0.0); localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component localImagOut[hFFT->Points] = _mm_set1_ps(0.0); } void ReorderToTime4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut) { __m128 *localBuffer=(__m128 *)buffer; __m128 *localTimeOut=(__m128 *)TimeOut; int bitReverseShift=16-hFFT->pow2Bits; // Copy the data into the real outputs for(size_t i = 0; i < hFFT->Points; i++) { int brValue; brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift); // brValue=(*SmallVRB[hFFT->pow2Bits])(i); localTimeOut[i*2 ] = localBuffer[brValue ]; localTimeOut[i*2+1] = localBuffer[brValue+1]; } } #endif #endif