1
0
mirror of https://github.com/cookiengineer/audacity synced 2025-06-16 16:10:06 +02:00
audacity/src/effects/Equalization48x.cpp
Leland Lucius 8fbfa460c4 Migrating the remaining effects
This brings the builtin, LV2, and VAMP effects inline with the
Audio Units, LADSPA, and VST effects.  All effects now share
a common UI.

This gives all effects (though not implemented for all):

User and factory preset capability
Preset import/export capability
Shared or private configuration options

Builtin effects can now be migrated to RTP, depending on algorithm.
LV2 effects now support graphical interfaces if the plugin supplies one.
Nyquist prompt enhanced to provide some features of the Nyquist Workbench.

It may not look like it, but this was a LOT of work, so trust me, there
WILL be problems and everything effect related should be suspect.  Keep
a sharp eye (or two) open.
2015-04-16 23:36:28 -05:00

1327 lines
55 KiB
C++

/**********************************************************************
Audacity: A Digital Audio Editor
Equalization48x.cpp
Andrew Hallendorff
*******************************************************************//**
\file Equalization48x.cpp
\brief Fast SSE based implementation of equalization.
*//****************************************************************/
#include "../Audacity.h"
#include "../Project.h"
#ifdef EXPERIMENTAL_EQ_SSE_THREADED
#include "Equalization.h"
#include "../WaveTrack.h"
#include "float_cast.h"
#include <vector>
#include <wx/dcmemory.h>
#include <wx/event.h>
#include <wx/string.h>
#if wxUSE_TOOLTIPS
#include <wx/tooltip.h>
#endif
#include <wx/utils.h>
#include <math.h>
#include <wx/arrimpl.cpp>
#include "Equalization48x.h"
#include "../RealFFTf.h"
#include "../RealFFTf48x.h"
#ifndef USE_SSE2
#define USE_SSE2
#endif
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <math.h>
#include <emmintrin.h>
#ifdef _WIN32
// Windows
#include <intrin.h>
#define cpuid __cpuid
#else
// GCC Inline Assembly
// Executes the x86 CPUID instruction (GCC-style inline assembly).
//   CPUInfo  - receives the four result registers: [0]=EAX, [1]=EBX, [2]=ECX, [3]=EDX.
//   InfoType - the leaf number, loaded into EAX before the instruction runs.
// Only EAX is supplied as an input operand; ECX is not set by this wrapper.
void cpuid(int CPUInfo[4],int InfoType){
__asm__ __volatile__ (
"cpuid":
// outputs: one per result register
"=a" (CPUInfo[0]),
"=b" (CPUInfo[1]),
"=c" (CPUInfo[2]),
"=d" (CPUInfo[3]) :
// input: requested leaf in EAX
"a" (InfoType)
);
}
#endif
bool sMathCapsInitialized = false;
MathCaps sMathCaps;
// dirty switcher
int sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED;
// Replaces the whole set of math-path flags with mathPath.
void EffectEqualization48x::SetMathPath(int mathPath)
{
   sMathPath = mathPath;
}
// Returns the currently selected math-path flags.
int EffectEqualization48x::GetMathPath()
{
   return sMathPath;
}
// Turns on the given flag bit(s) in the math path, leaving the others untouched.
void EffectEqualization48x::AddMathPathOption(int mathPath)
{
   sMathPath = sMathPath | mathPath;
}
// Turns off the given flag bit(s) in the math path, leaving the others untouched.
void EffectEqualization48x::RemoveMathPathOption(int mathPath)
{
   sMathPath = sMathPath & ~mathPath;
}
// Returns the shared CPU capability table.  The first call queries the
// processor through cpuid and fills the table in; later calls just return it.
// As a side effect of the first call, the math path is set to
// SSE + threaded when the CPU reports SSE.
MathCaps *EffectEqualization48x::GetMathCaps()
{
   if (sMathCapsInitialized)
      return &sMathCaps;
   sMathCapsInitialized = true;

   // Start with every feature marked as unavailable.
   sMathCaps.x64   = false;
   sMathCaps.MMX   = false;
   sMathCaps.SSE   = false;
   sMathCaps.SSE2  = false;
   sMathCaps.SSE3  = false;
   sMathCaps.SSSE3 = false;
   sMathCaps.SSE41 = false;
   sMathCaps.SSE42 = false;
   sMathCaps.SSE4a = false;
   sMathCaps.AVX   = false;
   sMathCaps.XOP   = false;
   sMathCaps.FMA3  = false;
   sMathCaps.FMA4  = false;

   int regs[4];

   // Leaf 0 reports the highest standard leaf, leaf 0x80000000 the highest
   // extended leaf.
   cpuid(regs, 0);
   int highestStdLeaf = regs[0];
   cpuid(regs, 0x80000000);
   int highestExtLeaf = regs[0];

   // Standard feature bits (leaf 1): EDX is regs[3], ECX is regs[2].
   if (highestStdLeaf >= 1) {
      cpuid(regs, 0x00000001);
      const int ecx = regs[2];
      const int edx = regs[3];
      sMathCaps.MMX   = (edx & (1 << 23)) != 0;
      sMathCaps.SSE   = (edx & (1 << 25)) != 0;
      sMathCaps.SSE2  = (edx & (1 << 26)) != 0;
      sMathCaps.SSE3  = (ecx & (1 <<  0)) != 0;
      sMathCaps.SSSE3 = (ecx & (1 <<  9)) != 0;
      sMathCaps.SSE41 = (ecx & (1 << 19)) != 0;
      sMathCaps.SSE42 = (ecx & (1 << 20)) != 0;
      sMathCaps.AVX   = (ecx & (1 << 28)) != 0;
      sMathCaps.FMA3  = (ecx & (1 << 12)) != 0;
   }

   // Extended feature bits (leaf 0x80000001).
   if (highestExtLeaf >= 0x80000001) {
      cpuid(regs, 0x80000001);
      const int ecx = regs[2];
      const int edx = regs[3];
      sMathCaps.x64   = (edx & (1 << 29)) != 0;
      sMathCaps.SSE4a = (ecx & (1 <<  6)) != 0;
      sMathCaps.FMA4  = (ecx & (1 << 16)) != 0;
      sMathCaps.XOP   = (ecx & (1 << 11)) != 0;
   }

   if (sMathCaps.SSE)
      sMathPath = MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED; // we are starting on.

   return &sMathCaps;
}
// Allocates `size` bytes suitable for 16-byte aligned SSE loads/stores.
// Returns NULL when the allocation fails.  Memory obtained here must be
// released with free_simd(), never with plain free()/delete.
//
// The Windows test accepts the compiler-defined _WIN32 (already used by the
// cpuid selection in this file) as well as the project-defined WIN32, so the
// aligned CRT allocator is chosen even when WIN32 is not defined by the build.
void * malloc_simd(const size_t size)
{
#if defined(_WIN32) || defined(WIN32) // Windows
   return _aligned_malloc(size, 16);
#elif defined(__linux__) // Linux
   // posix_memalign replaces the obsolete memalign; the block is still
   // released with free().
   void *mem = NULL;
   if (posix_memalign(&mem, 16, size) != 0)
      return NULL;
   return mem;
#elif defined(__MACH__) // Mac OS X
   // NOTE(review): relies on the platform malloc returning 16-byte aligned
   // blocks - confirm for the targeted OS versions.
   return malloc(size);
#else // other (use valloc for page-aligned memory)
   return valloc(size);
#endif
}

// Releases a block obtained from malloc_simd().  Passing NULL is harmless.
// The platform test must stay identical to the one in malloc_simd() so that
// allocator and deallocator always match.
void free_simd(void* mem)
{
#if defined(_WIN32) || defined(WIN32) // Windows
   _aligned_free(mem);
#else
   free(mem);
#endif
}
// Starts with no buffers, no worker threads and 20 blocks per buffer;
// everything else is sized later by AllocateBuffersWorkers().
EffectEqualization48x::EffectEqualization48x()
   : mThreadCount(0)
   , mFilterSize(0)
   , mWindowSize(0)
   , mBlockSize(0)
   , mWorkerDataCount(0)
   , mBlocksPerBuffer(20)
   , mScratchBufferSize(0)
   , mSubBufferSize(0)
   , mBigBuffer(NULL)
   , mBufferInfo(NULL)
   , mEQWorkers(NULL)
   , mThreaded(false)
   , mBenching(false)
   , mBufferCount(0)
{
}
// Nothing to release here: buffers and workers are torn down explicitly
// through FreeBuffersWorkers().
EffectEqualization48x::~EffectEqualization48x() {}
// Sizes and allocates the shared sample buffer, fills in one BufferInfo per
// worker slot and, when threading is requested, starts the worker threads.
//   nThreads - non-zero/positive requests threaded processing.
// Returns true on success, false when the sample buffer could not be
// allocated (in which case the object is left with no buffers and no workers).
// Any buffers/workers from a previous call are released first.
bool EffectEqualization48x::AllocateBuffersWorkers(int nThreads)
{
   if(mBigBuffer)
      FreeBuffersWorkers();
   mFilterSize=(mEffectEqualization->mM-1)&(~15); // 4000 !!! Filter MUST BE QUAD WORD ALIGNED !!!!
   mWindowSize=mEffectEqualization->windowSize;
   mBlockSize=mWindowSize-mFilterSize; // 12,384

   // wxThread::GetCPUCount() reports -1 when the number of CPUs is unknown.
   // Treat that (or 0) as "no threading" instead of sizing arrays with it.
   int cpuCount = (nThreads > 0) ? wxThread::GetCPUCount() : 0;
   mThreaded = (cpuCount > 0);
   if(mThreaded)
   {
      mThreadCount=cpuCount;
      mWorkerDataCount=mThreadCount+2; // 2 extra slots (maybe double later)
   } else {
      mWorkerDataCount=1;
      mThreadCount=0;
   }
#ifdef __AVX_ENABLED
   mBufferCount=sMathPath&MATH_FUNCTION_AVX?8:4;
#else
   mBufferCount=4;
#endif
   // we're skewing the data by one block to allow for 1/4 block intersections.
   // this will remove the disparity in data at the intersections of the runs
   // The nice magic allocation
   // megabyte - 3 windows - 4 overlapping buffers - filter
   // 2^20 = 1,048,576 - 3 * 2^14 (16,384) - ((4 * 20) - 3) * 12,384 - 4000
   // 1,048,576 - 49,152 - 953,568 - 4000 = 41,856 (leftover)
   mScratchBufferSize=mWindowSize*3*sizeof(float)*mBufferCount; // 3 window size blocks of instruction size
   mSubBufferSize=mBlockSize*(mBufferCount*(mBlocksPerBuffer-1)); // we are going to do a full block overlap
   mBigBuffer=(float *)malloc_simd(sizeof(float)*(mSubBufferSize+mFilterSize+mScratchBufferSize)*mWorkerDataCount); // we run over by filtersize
   if(!mBigBuffer) {
      // Out of memory: leave a state that FreeBuffersWorkers() can handle.
      mThreaded=false;
      mThreadCount=0;
      mWorkerDataCount=0;
      return false;
   }

   // fill the bufferInfo
   mBufferInfo = new BufferInfo[mWorkerDataCount];
   for(int i=0;i<mWorkerDataCount;i++) {
      // each worker slot owns [sub buffer | scratch] inside mBigBuffer
      const int slotStart=(mSubBufferSize+mScratchBufferSize)*i;
      mBufferInfo[i].mFftWindowSize=mWindowSize;
      mBufferInfo[i].mFftFilterSize=mFilterSize;
      mBufferInfo[i].mBufferLength=mBlockSize*mBlocksPerBuffer;
      mBufferInfo[i].mContiguousBufferSize=mSubBufferSize;
      mBufferInfo[i].mScratchBuffer=&mBigBuffer[slotStart+mSubBufferSize];
      // source and destination alias each other; consecutive sub-buffers
      // overlap by one block
      for(int j=0;j<mBufferCount;j++)
         mBufferInfo[i].mBufferDest[j]=mBufferInfo[i].mBufferSouce[j]=&mBigBuffer[j*(mBufferInfo[i].mBufferLength-mBlockSize)+slotStart];
   }

   if(mThreadCount) {
      if(!mDataMutex.IsOk()) {
         // Without a usable mutex the workers cannot be synchronised;
         // keep the buffers and fall back to unthreaded processing.
         mThreaded=false;
         mThreadCount=0;
         return true;
      }
      // start the workers
      mEQWorkers=new EQWorker[mThreadCount];
      for(int i=0;i<mThreadCount;i++) {
         mEQWorkers[i].SetData( mBufferInfo, mWorkerDataCount, &mDataMutex, this);
         mEQWorkers[i].Create();
         mEQWorkers[i].Run();
      }
   }
   return true;
}
// Stops and destroys the worker threads (if threading was enabled), then
// releases the BufferInfo array and the shared sample buffer.
// Always returns true.
bool EffectEqualization48x::FreeBuffersWorkers()
{
   if(mThreaded) {
      // First ask every worker to leave its loop...
      int worker=0;
      while(worker<mThreadCount) {
         mEQWorkers[worker].ExitLoop();
         worker++;
      }
      // ...and only then block until each one has finished.
      worker=0;
      while(worker<mThreadCount) {
         mEQWorkers[worker].Wait();
         worker++;
      }
      delete[] mEQWorkers; // kill the workers ( go directly to jail)
      mEQWorkers=NULL;
      mThreadCount=0;
      mWorkerDataCount=0;
   }

   delete [] mBufferInfo;
   mBufferInfo=NULL;

   free_simd(mBigBuffer);
   mBigBuffer=NULL;

   return true;
}
#pragma warning(push)
// Disable the unreachable code warning in MSVC, for this function.
#pragma warning(disable: 4702)
// Dispatches one track range to the processing routine selected by `flags`.
// The table flags (bit-reverse / sin-cos tables) are ignored for dispatch.
// Returns whatever the chosen routine returns; for the fallback path the
// result of EffectEqualization::ProcessOne is negated.
bool EffectEqualization48x::RunFunctionSelect(int flags, int count, WaveTrack * track, sampleCount start, sampleCount len)
{
   // deal with tables here: strip the table flags before comparing
   const int path = flags & ~(MATH_FUNCTION_BITREVERSE_TABLE|MATH_FUNCTION_SIN_COS_TABLE);

   if(path == (MATH_FUNCTION_SSE))
      return ProcessOne4x(count, track, start, len);

   if(path == (MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED))
      return ProcessOne1x4xThreaded(count, track, start, len);

   if(path == (MATH_FUNCTION_THREADED) ||
      path == (MATH_FUNCTION_THREADED|MATH_FUNCTION_SEGMENTED_CODE))
      return ProcessOne1x4xThreaded(count, track, start, len, 1);

   if(path == (MATH_FUNCTION_SEGMENTED_CODE))
      return ProcessOne1x(count, track, start, len);

   // anything else: the original, unaccelerated implementation
   return !mEffectEqualization->ProcessOne(count, track, start, len);
}
#pragma warning(pop)
// Applies the equalization to every selected wave track of the owning effect,
// restricted to the effect's [mT0, mT1] range, using the current math path.
// Returns true when all tracks were processed, false when a processing
// routine reported a stop (in which case the results are not committed).
bool EffectEqualization48x::Process(EffectEqualization* effectEqualization)
{
   mEffectEqualization=effectEqualization;
   // return TrackCompare(); // used for debugging data
   mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
   bool stopped = false;

   TableUsage(sMathPath);
   if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
      mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
   AllocateBuffersWorkers(sMathPath&MATH_FUNCTION_THREADED);

   SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks);
   int trackIndex = 0;
   for(WaveTrack *wave = (WaveTrack *) iter.First(); wave;
       wave = (WaveTrack *) iter.Next(), trackIndex++) {
      const double trackStart = wave->GetStartTime();
      const double trackEnd = wave->GetEndTime();
      // clip the selection to the extent of this track
      const double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
      const double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
      if (t1 <= t0)
         continue;

      sampleCount first = wave->TimeToLongSamples(t0);
      sampleCount last = wave->TimeToLongSamples(t1);
      sampleCount length = (sampleCount)(last - first);
      stopped=RunFunctionSelect(sMathPath, trackIndex, wave, first, length);
      if( stopped )
         break;
   }

   FreeBuffersWorkers();
   mEffectEqualization->ReplaceProcessedTracks(!stopped);
   return !stopped;
}
bool EffectEqualization48x::TrackCompare()
{
mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
bool bBreakLoop = false;
TableUsage(sMathPath);
if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
AllocateBuffersWorkers(sMathPath&MATH_FUNCTION_THREADED);
// Reset map
wxArrayPtrVoid SecondIMap;
wxArrayPtrVoid SecondOMap;
SecondIMap.Clear();
SecondOMap.Clear();
TrackList *SecondOutputTracks = new TrackList();
//iterate over tracks of type trackType (All types if Track::All)
TrackListOfKindIterator aIt(mEffectEqualization->mOutputTracksType, mEffectEqualization->mTracks);
for (Track *aTrack = aIt.First(); aTrack; aTrack = aIt.Next()) {
// Include selected tracks, plus sync-lock selected tracks for Track::All.
if (aTrack->GetSelected() ||
(mEffectEqualization->mOutputTracksType == Track::All && aTrack->IsSyncLockSelected()))
{
Track *o = aTrack->Duplicate();
SecondOutputTracks->Add(o);
SecondIMap.Add(aTrack);
SecondIMap.Add(o);
}
}
for(int i=0;i<2;i++) {
SelectedTrackListOfKindIterator iter(Track::Wave, i?mEffectEqualization->mOutputTracks:SecondOutputTracks);
i?sMathPath=sMathPath:sMathPath=0;
WaveTrack *track = (WaveTrack *) iter.First();
int count = 0;
while (track) {
double trackStart = track->GetStartTime();
double trackEnd = track->GetEndTime();
double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
if (t1 > t0) {
sampleCount start = track->TimeToLongSamples(t0);
sampleCount end = track->TimeToLongSamples(t1);
sampleCount len = (sampleCount)(end - start);
bBreakLoop=RunFunctionSelect(sMathPath, count, track, start, len);
if( bBreakLoop )
break;
}
track = (WaveTrack *) iter.Next();
count++;
}
}
SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks);
SelectedTrackListOfKindIterator iter2(Track::Wave, SecondOutputTracks);
WaveTrack *track = (WaveTrack *) iter.First();
WaveTrack *track2 = (WaveTrack *) iter2.First();
while (track) {
double trackStart = track->GetStartTime();
double trackEnd = track->GetEndTime();
double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
if (t1 > t0) {
sampleCount start = track->TimeToLongSamples(t0);
sampleCount end = track->TimeToLongSamples(t1);
sampleCount len = (sampleCount)(end - start);
DeltaTrack(track, track2, start, len);
}
track = (WaveTrack *) iter.Next();
track2 = (WaveTrack *) iter2.Next();
}
delete SecondOutputTracks;
FreeBuffersWorkers();
mEffectEqualization->ReplaceProcessedTracks(!bBreakLoop);
return bBreakLoop;
}
// Replaces the range [start, start+len) of track `t` with the sample-wise
// difference t - t2 over that same range.  The difference is accumulated in
// a temporary float track and pasted back through ProcessTail().
// Always returns true.
bool EffectEqualization48x::DeltaTrack(WaveTrack * t, WaveTrack * t2, sampleCount start, sampleCount len)
{
   const sampleCount chunkMax = t->GetMaxBlockSize();
   float *minuend = new float[chunkMax];
   float *subtrahend = new float[chunkMax];

   AudacityProject *project = GetActiveProject();
   WaveTrack *difference=project->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());

   sampleCount position = start;
   for(sampleCount remaining = len; remaining; ) {
      // never read more than one track block at a time
      sampleCount chunk = (chunkMax>remaining)?remaining:chunkMax;
      t->Get((samplePtr)minuend, floatSample, position, chunk);
      t2->Get((samplePtr)subtrahend, floatSample, position, chunk);
      for(int i=0;i<chunk;i++)
         minuend[i]-=subtrahend[i];
      difference->Append((samplePtr)minuend, floatSample, chunk);
      position+=chunk;
      remaining-=chunk;
   }

   delete[] minuend;
   delete[] subtrahend;

   difference->Flush();
   ProcessTail(t, difference, start, len);
   delete difference;
   return true;
}
// Times each processing path on the current selection and shows the results
// in a message box.  The two SSE paths are skipped (time stays 0) when the
// capability table does not report SSE.  The processed audio is discarded.
// Always returns false.
bool EffectEqualization48x::Benchmark(EffectEqualization* effectEqualization)
{
   mEffectEqualization=effectEqualization;
   mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
   bool stopped = false;
   TableUsage(sMathPath);
   if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
      mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
   AllocateBuffersWorkers(MATH_FUNCTION_THREADED);
   SelectedTrackListOfKindIterator iter(Track::Wave, mEffectEqualization->mOutputTracks);

   // One entry per timed pass; the first two need SSE.
   const int kPassCount = 5;
   const int passPaths[kPassCount] = {
      MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED,
      MATH_FUNCTION_SSE,
      MATH_FUNCTION_SEGMENTED_CODE,
      MATH_FUNCTION_THREADED|MATH_FUNCTION_SEGMENTED_CODE,
      0
   };
   long elapsed[kPassCount] = { 0,0,0,0,0 };

   wxStopWatch timer;
   mBenching=true;
   for(int pass=0;pass<kPassCount && !stopped;pass++) {
      int path=passPaths[pass];
      if(pass<2 && !sMathCaps.SSE)
         path=-1; // SSE not available: skip this pass
      if(path<0)
         continue;

      timer.Start();
      int trackIndex = 0;
      for(WaveTrack *wave = (WaveTrack *) iter.First(); wave;
          wave = (WaveTrack *) iter.Next(), trackIndex++) {
         double trackStart = wave->GetStartTime();
         double trackEnd = wave->GetEndTime();
         double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
         double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
         if (t1 > t0) {
            sampleCount start = wave->TimeToLongSamples(t0);
            sampleCount end = wave->TimeToLongSamples(t1);
            sampleCount len = (sampleCount)(end - start);
            stopped=RunFunctionSelect( path, trackIndex, wave, start, len);
            if( stopped )
               break;
         }
      }
      elapsed[pass]=timer.Time();
   }
   FreeBuffersWorkers();
   mBenching=false;

   // Benchmark output is never committed, whatever happened above.
   stopped=false;
   mEffectEqualization->ReplaceProcessedTracks(stopped);

   wxTimeSpan spanSSEThreaded(0, 0, 0, elapsed[0]);
   wxTimeSpan spanSSE(0, 0, 0, elapsed[1]);
   wxTimeSpan spanSegmented(0, 0, 0, elapsed[2]);
   wxTimeSpan spanThreaded(0, 0, 0, elapsed[3]);
   wxTimeSpan spanOriginal(0, 0, 0, elapsed[4]);
   wxMessageBox(wxString::Format(_("Benchmark times:\nOriginal: %s\nDefault Segmented: %s\nDefault Threaded: %s\nSSE: %s\nSSE Threaded: %s\n"),
      spanOriginal.Format(wxT("%M:%S.%l")).c_str(),
      spanSegmented.Format(wxT("%M:%S.%l")).c_str(),
      spanThreaded.Format(wxT("%M:%S.%l")).c_str(),
      spanSSE.Format(wxT("%M:%S.%l")).c_str(),
      spanSSEThreaded.Format(wxT("%M:%S.%l")).c_str()));
   return stopped;
}
// Pastes the processed audio in `output` back into track `t`, clip by clip.
//   t      - track being modified
//   output - processed audio; its time 0 corresponds to sample `start` of t
//   start  - first processed sample in t
//   len    - number of processed samples
// Only the parts of t's clips that intersect the processed range are
// replaced, so the original clip structure is kept.  Always returns true.
bool EffectEqualization48x::ProcessTail(WaveTrack * t, WaveTrack * output, sampleCount start, sampleCount len)
{
   //   double offsetT0 = t->LongSamplesToTime((sampleCount)offset);
   double lenT = t->LongSamplesToTime(len);
   // 'start' is the sample offset in 't', the passed in track
   // 'startT' is the equivalent time value
   // 'output' starts at zero
   double startT = t->LongSamplesToTime(start);
   // end of the processed range, in track time
   const double endT = startT + lenT;

   //output has one waveclip for the total length, even though
   //t might have whitespace separating multiple clips
   //we want to maintain the original clip structure, so
   //only paste the intersections of the new clip.

   //Find the bits of clips that need replacing
   std::vector<std::pair<double, double> > clipStartEndTimes;
   std::vector<std::pair<double, double> > clipRealStartEndTimes; //the above may be truncated due to a clip being partially selected
   for (WaveClipList::compatibility_iterator it=t->GetClipIterator(); it; it=it->GetNext())
   {
      WaveClip *clip = it->GetData();
      double clipStartT = clip->GetStartTime();
      double clipEndT = clip->GetEndTime();

      if( clipEndT <= startT )
         continue;   // clip is not within selection
      if( clipStartT >= endT )
         continue;   // clip is not within selection

      //save the actual clip start/end so that we can rejoin them after we paste.
      clipRealStartEndTimes.push_back(std::pair<double,double>(clipStartT,clipEndT));

      if( clipStartT < startT )  // does selection cover the whole clip?
         clipStartT = startT; // don't copy all the new clip
      if( clipEndT > endT )  // does selection cover the whole clip?
         clipEndT = endT; // don't copy all the new clip

      //save them
      clipStartEndTimes.push_back(std::pair<double,double>(clipStartT,clipEndT));
   }

   //now go thru and replace the old clips with new
   for(unsigned int i=0;i<clipStartEndTimes.size();i++)
   {
      // Initialised so that the test below is well defined even if Copy()
      // fails without storing a result.
      Track *toClipOutput = NULL;
      //remove the old audio and get the new
      t->Clear(clipStartEndTimes[i].first,clipStartEndTimes[i].second);
      //         output->Copy(clipStartEndTimes[i].first-startT+offsetT0,clipStartEndTimes[i].second-startT+offsetT0, &toClipOutput);
      output->Copy(clipStartEndTimes[i].first-startT,clipStartEndTimes[i].second-startT, &toClipOutput);
      if(toClipOutput)
      {
         //put the processed audio in
         bool bResult = t->Paste(clipStartEndTimes[i].first, toClipOutput);
         wxASSERT(bResult); // TO DO: Actually handle this.
         //if the clip was only partially selected, the Paste will have created a split line.  Join is needed to take care of this
         //This is not true when the selection is fully contained within one clip (second half of conditional)
         if( (clipRealStartEndTimes[i].first  != clipStartEndTimes[i].first ||
              clipRealStartEndTimes[i].second != clipStartEndTimes[i].second) &&
             !(clipRealStartEndTimes[i].first <= startT &&
               clipRealStartEndTimes[i].second >= endT) )
            t->Join(clipRealStartEndTimes[i].first,clipRealStartEndTimes[i].second);
         delete toClipOutput;
      }
   }
   return true;
}
// Filters one contiguous run of bufferLength samples from sourceBuffer into
// destBuffer using the scalar routine.  The scratch area is taken to start
// mSubBufferSize samples past sourceBuffer.
bool EffectEqualization48x::ProcessBuffer(fft_type *sourceBuffer, fft_type *destBuffer, sampleCount bufferLength)
{
   BufferInfo contiguous;
   contiguous.mBufferSouce[0]=sourceBuffer;
   contiguous.mBufferDest[0]=destBuffer;
   contiguous.mScratchBuffer=sourceBuffer+mSubBufferSize;
   contiguous.mContiguousBufferSize=bufferLength;
   return ProcessBuffer1x(&contiguous);
}
// Scalar (one sample stream at a time) block filter.
// Each source buffer described by bufferInfo is cut into mBlockSize-sample
// blocks; every block is zero-padded to the window size, filtered through
// Filter1x(), and written to the matching destination buffer with the
// previous block's filter tail added in.
// Always returns true.
bool EffectEqualization48x::ProcessBuffer1x(BufferInfo *bufferInfo)
{
// A non-zero mContiguousBufferSize means "one contiguous buffer";
// otherwise four sub-buffers are processed in turn.
int bufferCount=bufferInfo->mContiguousBufferSize?1:4;
for(int bufferIndex=0;bufferIndex<bufferCount;bufferIndex++)
{
int bufferLength=bufferInfo->mBufferLength;
if(bufferInfo->mContiguousBufferSize)
bufferLength=bufferInfo->mContiguousBufferSize;
// number of blocks, rounded up to include a partial last block
sampleCount blockCount=bufferLength/mBlockSize;
sampleCount lastBlockSize=bufferLength%mBlockSize;
if(lastBlockSize)
blockCount++;
// The scratch area holds two windows used alternately, followed by the
// scratch space handed to Filter1x().
float *workBuffer=bufferInfo->mScratchBuffer; // all scratch buffers are at the end
float *scratchBuffer=&workBuffer[mWindowSize*2]; // all scratch buffers are at the end
float *sourceBuffer=bufferInfo->mBufferSouce[bufferIndex];
float *destBuffer=bufferInfo->mBufferDest[bufferIndex];
for(int runx=0;runx<blockCount;runx++)
{
// even runs use the first window, odd runs the second, so the previous
// run's result is still available for the overlap below
float *currentBuffer=&workBuffer[mWindowSize*(runx&1)];
for(int i=0;i<mBlockSize;i++)
currentBuffer[i]=sourceBuffer[i];
sourceBuffer+=mBlockSize;
// zero the mFilterSize samples that follow the block
float *currentFilter=&currentBuffer[mBlockSize];
for(int i=0;i<mFilterSize;i++)
currentFilter[i]=0;
// mEffectEqualization->Filter(mWindowSize, currentBuffer);
Filter1x(mWindowSize, currentBuffer, scratchBuffer);
float *writeEnd=currentBuffer+mBlockSize;
// NOTE(review): runx is always < blockCount inside this loop, so this
// condition looks like it can never be true - confirm whether
// blockCount-1 was intended.
if(runx==blockCount)
writeEnd=currentBuffer+(lastBlockSize+mFilterSize);
if(runx) {
// add the tail left over from the previous run (stored after the block in
// the other window) to the first mFilterSize outputs of this run
float *lastOverrun=&workBuffer[mWindowSize*((runx+1)&1)+mBlockSize];
for(int j=0;j<mFilterSize;j++)
*destBuffer++= *currentBuffer++ + *lastOverrun++;
} else
currentBuffer+=mFilterSize>>1; // this will skip the first filterSize on the first run
// copy the rest of the block unchanged
while(currentBuffer<writeEnd)
*destBuffer++ = *currentBuffer++;
}
}
return true;
}
// Equalizes the range [start, start+len) of track t with the scalar routine.
//   count - index of the track, passed through to TrackProgress()
// The range is read into mBigBuffer in large overlapping runs, filtered, and
// collected in a temporary float track which ProcessTail() then pastes back
// into t.
// Returns the last value reported by TrackProgress() (true = the loop was
// stopped early and nothing was pasted back), false otherwise.
bool EffectEqualization48x::ProcessOne1x(int count, WaveTrack * t,
sampleCount start, sampleCount len)
{
//sampleCount blockCount=len/mBlockSize;
sampleCount trackBlockSize = t->GetMaxBlockSize();
AudacityProject *p = GetActiveProject();
// temporary track that receives the processed samples
WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
mEffectEqualization->TrackProgress(count, 0.0);
int subBufferSize=mBufferCount==8?(mSubBufferSize>>1):mSubBufferSize; // half the buffers if avx is active
// number of full-size runs; consecutive runs overlap by one block
int bigRuns=len/(subBufferSize-mBlockSize);
// each run is read in track-block sized pieces plus a remainder
int trackBlocksPerBig=subBufferSize/trackBlockSize;
int trackLeftovers=subBufferSize-trackBlocksPerBig*trackBlockSize;
// length handled by the final, contiguous pass after the big runs
int singleProcessLength;
if(!bigRuns)
singleProcessLength=len;
else
singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(subBufferSize-mBlockSize));
sampleCount currentSample=start;
bool bBreakLoop = false;
for(int bigRun=0;bigRun<bigRuns;bigRun++)
{
// fill the buffer
for(int i=0;i<trackBlocksPerBig;i++) {
t->Get((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize);
currentSample+=trackBlockSize;
}
if(trackLeftovers) {
t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
currentSample+=trackLeftovers;
}
// step back so the next run re-reads one block plus half the filter length
currentSample-=mBlockSize+(mFilterSize>>1);
ProcessBuffer1x(mBufferInfo);
bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigRun)/(double)bigRuns);
if( bBreakLoop )
break;
// skip half the filter length, and on every run after the first also the
// overlapping block, before appending
output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, subBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1)));
}
// whatever remains after the big runs is processed as one contiguous buffer
if(singleProcessLength && !bBreakLoop) {
t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1));
output->Append((samplePtr)&mBigBuffer[bigRuns?mBlockSize:0], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
}
output->Flush();
if(!bBreakLoop)
ProcessTail(t, output, start, len);
delete output;
return bBreakLoop;
}
// Filters one window in the frequency domain.
//   len           - window length in samples
//   buffer        - input samples; overwritten with the filtered result
//   scratchBuffer - work area that receives the filtered spectrum
// Steps: forward FFT of buffer, complex multiply of every bin by the
// effect's filter response into scratchBuffer, inverse FFT, reorder back
// into buffer.
void EffectEqualization48x::Filter1x(sampleCount len,
                                     float *buffer, float *scratchBuffer)
{
   // Apply FFT
   RealFFTf1x(buffer, mEffectEqualization->hFFT);

   // Apply filter
   float respR, respI;

   // DC component is purely real
   respR=mEffectEqualization->mFilterFuncR[0];
   scratchBuffer[0]=buffer[0]*respR;

   const int halfLength=(len/2);
   // low bit of the math path: use the precomputed bit-reversal table
   const bool viaTable=(sMathPath&1)!=0;

   for(int bin=1; bin<halfLength; bin++)
   {
      // position of this bin in the bit-reversed FFT output
      int source;
      if(viaTable)
         source=mEffectEqualization->hFFT->BitReversed[bin];
      else
         source=SmallRB(bin,mEffectEqualization->hFFT->pow2Bits);

      const float re=buffer[source];
      const float im=buffer[source+1];
      respR=mEffectEqualization->mFilterFuncR[bin];
      respI=mEffectEqualization->mFilterFuncI[bin];

      // (re + j*im) * (respR + j*respI)
      scratchBuffer[2*bin  ] = re*respR - im*respI;
      scratchBuffer[2*bin+1] = re*respI + im*respR;
   }

   // Fs/2 component is purely real
   respR=mEffectEqualization->mFilterFuncR[halfLength];
   scratchBuffer[1] = buffer[1] * respR;

   // Inverse FFT and normalization
   InverseRealFFTf1x(scratchBuffer, mEffectEqualization->hFFT);
   ReorderToTime1x(mEffectEqualization->hFFT, scratchBuffer, buffer);
}
// SSE block filter: processes the four source buffers of bufferInfo in
// parallel, one buffer per SIMD lane.
// For every block the four streams are interleaved ("swizzled") so that each
// __m128 holds the same sample position of all four buffers, zero-padded,
// filtered with Filter4x(), then de-interleaved into the destination buffers
// with the previous block's filter tail added in.
// Returns false (doing nothing) when mBufferLength is not a multiple of
// mBlockSize, true otherwise.
bool EffectEqualization48x::ProcessBuffer4x(BufferInfo *bufferInfo)
{
// length must be a factor of window size for 4x processing.
if(bufferInfo->mBufferLength%mBlockSize)
return false;
sampleCount blockCount=bufferInfo->mBufferLength/mBlockSize;
__m128 *readBlocks[4]; // some temps so we dont destroy the vars in the struct
__m128 *writeBlocks[4];
for(int i=0;i<4;i++) {
readBlocks[i]=(__m128 *)bufferInfo->mBufferSouce[i];
writeBlocks[i]=(__m128 *)bufferInfo->mBufferDest[i];
}
// The scratch area holds two interleaved windows used alternately, followed
// by the scratch space handed to Filter4x().
__m128 *swizzledBuffer128=(__m128 *)bufferInfo->mScratchBuffer;
__m128 *scratchBuffer=&swizzledBuffer128[mWindowSize*2];
for(int run4x=0;run4x<blockCount;run4x++)
{
// swizzle the data to the swizzle buffer
__m128 *currentSwizzledBlock=&swizzledBuffer128[mWindowSize*(run4x&1)];
// 4x4 transpose: one vector (4 consecutive samples) from each of the four
// buffers becomes four vectors, each holding one sample position of all
// four buffers
for(int i=0,j=0;j<mBlockSize;i++,j+=4) {
__m128 tmp0 = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(1,0,1,0));
__m128 tmp1 = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(3,2,3,2));
__m128 tmp2 = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(1,0,1,0));
__m128 tmp3 = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(3,2,3,2));
currentSwizzledBlock[j] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,2,0));
currentSwizzledBlock[j+1] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3,1,3,1));
currentSwizzledBlock[j+2] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2,0,2,0));
currentSwizzledBlock[j+3] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3,1,3,1));
}
// zero the mFilterSize positions that follow the block
__m128 *thisOverrun128=&currentSwizzledBlock[mBlockSize];
for(int i=0;i<mFilterSize;i++)
thisOverrun128[i]=_mm_set1_ps(0.0);
Filter4x(mWindowSize, (float *)currentSwizzledBlock, (float *)scratchBuffer);
int writeStart=0, writeToStart=0; // note readStart is where the read data is written
int writeEnd=mBlockSize;
if(run4x) {
// maybe later swizzle add and write in one
// tail of the previous run, stored after the block in the other window
__m128 *lastOverrun128=&swizzledBuffer128[mWindowSize*((run4x+1)&1)+mBlockSize];
// add and swizzle data + filter
for(int i=0,j=0;j<mFilterSize;i++,j+=4) {
__m128 tmps0 = _mm_add_ps(currentSwizzledBlock[j], lastOverrun128[j]);
__m128 tmps1 = _mm_add_ps(currentSwizzledBlock[j+1], lastOverrun128[j+1]);
__m128 tmps2 = _mm_add_ps(currentSwizzledBlock[j+2], lastOverrun128[j+2]);
__m128 tmps3 = _mm_add_ps(currentSwizzledBlock[j+3], lastOverrun128[j+3]);
__m128 tmp0 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(0,1,0,1));
__m128 tmp1 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(2,3,2,3));
__m128 tmp2 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(0,1,0,1));
__m128 tmp3 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(2,3,2,3));
writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
}
// the remainder of the block starts after the overlapped part;
// writeToStart is the same offset counted in __m128 units
writeStart=mFilterSize;
writeToStart=mFilterSize>>2;
// swizzle it back.
for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=4) {
__m128 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1));
__m128 tmp1 = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(2,3,2,3));
__m128 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(0,1,0,1));
__m128 tmp3 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(2,3,2,3));
writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
}
} else {
// swizzle it back. We overlap one block so we only write the first block on the first run
writeStart=0;
writeToStart=0;
// only destination buffer 0 is written on the first run
for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=4) {
__m128 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1));
__m128 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(0,1,0,1));
writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
}
}
for(int i=0;i<4;i++) { // shift each block
readBlocks[i]+=mBlockSize>>2; // these are 128b pointers, each window is 1/4 blockSize for those
writeBlocks[i]+=mBlockSize>>2;
}
}
return true;
}
// Equalizes the range [start, start+len) of track t with the SSE routine.
//   count - index of the track, passed through to TrackProgress()
// Ranges shorter than one sub-buffer are handed to ProcessOne1x() instead.
// Otherwise the range is read into mBigBuffer in large overlapping runs,
// filtered with ProcessBuffer4x(), and collected in a temporary float track
// which ProcessTail() then pastes back into t; the remainder after the big
// runs goes through the scalar ProcessBuffer().
// Returns the last value reported by TrackProgress() (true = the loop was
// stopped early and nothing was pasted back), false otherwise.
bool EffectEqualization48x::ProcessOne4x(int count, WaveTrack * t,
sampleCount start, sampleCount len)
{
int subBufferSize=mBufferCount==8?(mSubBufferSize>>1):mSubBufferSize; // half the buffers if avx is active
if(len<subBufferSize) // it's not worth 4x processing do a regular process
return ProcessOne1x(count, t, start, len);
sampleCount trackBlockSize = t->GetMaxBlockSize();
AudacityProject *p = GetActiveProject();
// temporary track that receives the processed samples
WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
mEffectEqualization->TrackProgress(count, 0.0);
// number of full-size runs; consecutive runs overlap by one block
int bigRuns=len/(subBufferSize-mBlockSize);
// each run is read in track-block sized pieces plus a remainder
int trackBlocksPerBig=subBufferSize/trackBlockSize;
int trackLeftovers=subBufferSize-trackBlocksPerBig*trackBlockSize;
// length handled by the final, contiguous pass after the big runs
int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(subBufferSize-mBlockSize));
sampleCount currentSample=start;
bool bBreakLoop = false;
for(int bigRun=0;bigRun<bigRuns;bigRun++)
{
// fill the buffer
for(int i=0;i<trackBlocksPerBig;i++) {
t->Get((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize);
currentSample+=trackBlockSize;
}
if(trackLeftovers) {
t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
currentSample+=trackLeftovers;
}
// step back so the next run re-reads one block plus half the filter length
currentSample-=mBlockSize+(mFilterSize>>1);
ProcessBuffer4x(mBufferInfo);
bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigRun)/(double)bigRuns);
if( bBreakLoop )
break;
// skip half the filter length, and on every run after the first also the
// overlapping block, before appending
output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, subBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1)));
}
// whatever remains after the big runs is processed as one contiguous buffer
if(singleProcessLength && !bBreakLoop) {
t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1));
output->Append((samplePtr)&mBigBuffer[bigRuns?mBlockSize:0], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
// output->Append((samplePtr)&mBigBuffer[bigRuns?mBlockSize:0], floatSample, singleProcessLength);
}
output->Flush();
if(!bBreakLoop)
ProcessTail(t, output, start, len);
delete output;
return bBreakLoop;
}
// Worker thread body.  Until asked to exit, repeatedly claims the first
// buffer marked BufferReady (under the shared mutex), processes it with the
// routine selected by mProcessingType, and marks it BufferDone.
// Returns NULL.
void *EQWorker::Entry()
{
   while(!mExitLoop) {
      // Look for work while holding the mutex; claim at most one buffer.
      int claimed=-1;
      mMutex->Lock();
      for(int slot=0;slot<mBufferInfoCount;slot++) {
         if(mBufferInfoList[slot].mBufferStatus==BufferReady) { // we found an unlocked ready buffer
            mBufferInfoList[slot].mBufferStatus=BufferBusy; // we own it now
            claimed=slot;
            break;
         }
      }
      mMutex->Unlock();

      if(claimed<0)
         continue; // nothing ready, poll again

      // The actual filtering runs without the mutex held.
      if(mProcessingType==1)
         mEffectEqualization48x->ProcessBuffer1x(&mBufferInfoList[claimed]);
      else if(mProcessingType==4)
         mEffectEqualization48x->ProcessBuffer4x(&mBufferInfoList[claimed]);

      mBufferInfoList[claimed].mBufferStatus=BufferDone; // we're done
   }
   return NULL;
}
bool EffectEqualization48x::ProcessOne1x4xThreaded(int count, WaveTrack * t,
sampleCount start, sampleCount len, int processingType)
{
int subBufferSize=mBufferCount==8?(mSubBufferSize>>1):mSubBufferSize; // half the buffers if avx is active
sampleCount blockCount=len/mBlockSize;
if(blockCount<16) // it's not worth 4x processing do a regular process
return ProcessOne4x(count, t, start, len);
if(mThreadCount<=0 || blockCount<256) // dont do it without cores or big data
return ProcessOne4x(count, t, start, len);
for(int i=0;i<mThreadCount;i++)
mEQWorkers[i].mProcessingType=processingType;
AudacityProject *p = GetActiveProject();
WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
sampleCount trackBlockSize = t->GetMaxBlockSize();
mEffectEqualization->TrackProgress(count, 0.0);
int bigRuns=len/(subBufferSize-mBlockSize);
int trackBlocksPerBig=subBufferSize/trackBlockSize;
int trackLeftovers=subBufferSize-trackBlocksPerBig*trackBlockSize;
int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(subBufferSize-mBlockSize));
sampleCount currentSample=start;
int bigBlocksRead=mWorkerDataCount, bigBlocksWritten=0;
// fill the first workerDataCount buffers we checked above and there is at least this data
int maxPreFill=bigRuns<mWorkerDataCount?bigRuns:mWorkerDataCount;
for(int i=0;i<maxPreFill;i++)
{
// fill the buffer
for(int j=0;j<trackBlocksPerBig;j++) {
t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
currentSample+=trackBlockSize;
}
if(trackLeftovers) {
t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
currentSample+=trackLeftovers;
}
currentSample-=mBlockSize+(mFilterSize>>1);
mBufferInfo[i].mBufferStatus=BufferReady; // free for grabbin
}
int currentIndex=0;
bool bBreakLoop = false;
while(bigBlocksWritten<bigRuns && !bBreakLoop) {
bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigBlocksWritten)/(double)bigRuns);
if( bBreakLoop )
break;
mDataMutex.Lock(); // Get in line for data
// process as many blocks as we can
while((mBufferInfo[currentIndex].mBufferStatus==BufferDone) && (bigBlocksWritten<bigRuns)) { // data is ours
output->Append((samplePtr)&mBufferInfo[currentIndex].mBufferDest[0][(bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)], floatSample, subBufferSize-((bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)));
bigBlocksWritten++;
if(bigBlocksRead<bigRuns) {
// fill the buffer
for(int j=0;j<trackBlocksPerBig;j++) {
t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
currentSample+=trackBlockSize;
}
if(trackLeftovers) {
t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
currentSample+=trackLeftovers;
}
currentSample-=mBlockSize+(mFilterSize>>1);
mBufferInfo[currentIndex].mBufferStatus=BufferReady; // free for grabbin
bigBlocksRead++;
} else mBufferInfo[currentIndex].mBufferStatus=BufferEmpty; // this is completely unecessary
currentIndex=(currentIndex+1)%mWorkerDataCount;
}
mDataMutex.Unlock(); // Get back in line for data
}
if(singleProcessLength && !bBreakLoop) {
t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1));
output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
}
output->Flush();
if(!bBreakLoop)
ProcessTail(t, output, start, len);
delete output;
return bBreakLoop;
}
// Filters one window of interleaved data held in __m128 vectors.
// buffer is transformed in place by RealFFTf4x, multiplied bin by bin with
// mFilterFuncR/mFilterFuncI into scratchBuffer, inverse transformed there and
// finally reordered back into buffer by ReorderToTime4x.
// len is the window length; len/2 bins are processed.
void EffectEqualization48x::Filter4x(sampleCount len,
   float *buffer, float *scratchBuffer)
{
   // Apply FFT
   RealFFTf4x(buffer, mEffectEqualization->hFFT);

   __m128 *spectrum=(__m128 *)buffer;          // FFT output, bit-reversed order
   __m128 *filtered=(__m128 *)scratchBuffer;   // filtered bins, natural order
   const int halfLength=(len/2);
   const bool useBitReverseTable=(sMathPath&1)!=0;

   // DC component is purely real
   filtered[0]=_mm_mul_ps(spectrum[0], _mm_set1_ps(mEffectEqualization->mFilterFuncR[0]));

   for(int bin=1; bin<halfLength; bin++)
   {
      // Locate the bin either through the precomputed table or by computing
      // the reversed index on the fly.
      int source;
      if(useBitReverseTable)
         source=mEffectEqualization->hFFT->BitReversed[bin];
      else
         source=SmallRB(bin,mEffectEqualization->hFFT->pow2Bits);

      const __m128 real128=spectrum[source];
      const __m128 imag128=spectrum[source+1];
      const __m128 gainR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[bin]);
      const __m128 gainI=_mm_set1_ps(mEffectEqualization->mFilterFuncI[bin]);

      // complex multiply: (real + j*imag) * (gainR + j*gainI)
      filtered[2*bin  ] = _mm_sub_ps( _mm_mul_ps(real128, gainR), _mm_mul_ps(imag128, gainI));
      filtered[2*bin+1] = _mm_add_ps( _mm_mul_ps(real128, gainI), _mm_mul_ps(imag128, gainR));
   }

   // Fs/2 component is purely real
   filtered[1] = _mm_mul_ps(spectrum[1], _mm_set1_ps(mEffectEqualization->mFilterFuncR[halfLength]));

   // Inverse FFT and normalization
   InverseRealFFTf4x(scratchBuffer, mEffectEqualization->hFFT);
   ReorderToTime4x(mEffectEqualization->hFFT, scratchBuffer, buffer);
}
#ifdef __AVX_ENABLED
// note although written it has not been tested
//
// Filters the eight source buffers of bufferInfo into its eight destination
// buffers, one block (mBlockSize samples per buffer) at a time.
// For every block the eight inputs are interleaved ("swizzled") into the
// scratch area, the region after the block is zeroed, Filter8x is run on the
// window, and the result is de-interleaved into the destination buffers.
// From the second block on, the overrun left behind by the previous block is
// added to the first part of the current output before it is written.
// Returns false, without touching any buffer, when mBufferLength is not a
// multiple of mBlockSize or when mBufferCount is not 8; returns true otherwise.
bool EffectEqualization48x::ProcessBuffer8x(BufferInfo *bufferInfo)
{
// length must be a multiple of the block size, and all 8 buffers must be in use.
if(bufferInfo->mBufferLength%mBlockSize || mBufferCount!=8)
return false;
sampleCount blockCount=bufferInfo->mBufferLength/mBlockSize;
__m128 *readBlocks[8]; // some temps so we dont destroy the vars in the struct
__m128 *writeBlocks[8];
for(int i=0;i<8;i++) {
readBlocks[i]=(__m128 *)bufferInfo->mBufferSouce[i];
writeBlocks[i]=(__m128 *)bufferInfo->mBufferDest[i];
}
// The scratch buffer is split in two: the first mWindowSize*4 vectors hold two
// alternating swizzled windows (selected by run4x&1 below), the rest is the
// work area handed to Filter8x.
__m128 *swizzledBuffer128=(__m128 *)bufferInfo->mScratchBuffer;
__m128 *scratchBuffer=&swizzledBuffer128[mWindowSize*4];
// Sizes expressed in __m128 units of the swizzled layout (twice the sample count).
int doubleFilter=mFilterSize<<1;
int doubleWindow=mWindowSize<<1;
int doubleBlock=mBlockSize<<1;
for(int run4x=0;run4x<blockCount;run4x++)
{
// swizzle the data to the swizzle buffer
// Even and odd runs use different halves so the previous run's overrun
// is still available when the current run is written out.
__m128 *currentSwizzledBlock=&swizzledBuffer128[doubleWindow*(run4x&1)];
for(int i=0,j=0;j<doubleBlock;i++,j+=8) { // mBlockSize or doubleBlock???
// Buffers 0-3 go to the even slots j, j+2, j+4, j+6 ...
__m128 tmp0 = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(1,0,1,0));
__m128 tmp1 = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(3,2,3,2));
__m128 tmp2 = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(1,0,1,0));
__m128 tmp3 = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(3,2,3,2));
currentSwizzledBlock[j] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,2,0));
currentSwizzledBlock[j+2] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3,1,3,1));
currentSwizzledBlock[j+4] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2,0,2,0));
currentSwizzledBlock[j+6] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3,1,3,1));
// ... and buffers 4-7 to the odd slots j+1, j+3, j+5, j+7.
tmp0 = _mm_shuffle_ps(readBlocks[4][i], readBlocks[5][i], _MM_SHUFFLE(1,0,1,0));
tmp1 = _mm_shuffle_ps(readBlocks[4][i], readBlocks[5][i], _MM_SHUFFLE(3,2,3,2));
tmp2 = _mm_shuffle_ps(readBlocks[6][i], readBlocks[7][i], _MM_SHUFFLE(1,0,1,0));
tmp3 = _mm_shuffle_ps(readBlocks[6][i], readBlocks[7][i], _MM_SHUFFLE(3,2,3,2));
currentSwizzledBlock[j+1] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,2,0));
currentSwizzledBlock[j+3] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3,1,3,1));
currentSwizzledBlock[j+5] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2,0,2,0));
currentSwizzledBlock[j+7] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3,1,3,1));
}
// Zero the region that follows the block before filtering.
__m128 *thisOverrun128=&currentSwizzledBlock[doubleBlock];
for(int i=0;i<doubleFilter;i++)
thisOverrun128[i]=_mm_set1_ps(0.0);
Filter8x(mWindowSize, (float *)currentSwizzledBlock, (float *)scratchBuffer);
int writeStart=0, writeToStart=0; // note readStart is where the read data is written
int writeEnd=doubleBlock;
if(run4x) {
// maybe later swizzle add and write in one
// The overrun of the previous run lives in the other half of the swizzle buffer.
__m128 *lastOverrun128=&swizzledBuffer128[doubleWindow*((run4x+1)&1)+doubleBlock];
// add and swizzle data + filter
for(int i=0,j=0;j<doubleFilter;i++,j+=8) {
// even slots -> destination buffers 0-3
__m128 tmps0 = _mm_add_ps(currentSwizzledBlock[j], lastOverrun128[j]);
__m128 tmps1 = _mm_add_ps(currentSwizzledBlock[j+2], lastOverrun128[j+2]);
__m128 tmps2 = _mm_add_ps(currentSwizzledBlock[j+4], lastOverrun128[j+4]);
__m128 tmps3 = _mm_add_ps(currentSwizzledBlock[j+6], lastOverrun128[j+6]);
__m128 tmp0 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(0,1,0,1));
__m128 tmp1 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(2,3,2,3));
__m128 tmp2 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(0,1,0,1));
__m128 tmp3 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(2,3,2,3));
writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
// odd slots -> destination buffers 4-7
tmps0 = _mm_add_ps(currentSwizzledBlock[j+1], lastOverrun128[j+1]);
tmps1 = _mm_add_ps(currentSwizzledBlock[j+3], lastOverrun128[j+3]);
tmps2 = _mm_add_ps(currentSwizzledBlock[j+5], lastOverrun128[j+5]);
tmps3 = _mm_add_ps(currentSwizzledBlock[j+7], lastOverrun128[j+7]);
tmp0 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(0,1,0,1));
tmp1 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(2,3,2,3));
tmp2 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(0,1,0,1));
tmp3 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(2,3,2,3));
writeBlocks[4][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
writeBlocks[5][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
writeBlocks[6][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
writeBlocks[7][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
}
// The remainder of the block (past the summed region) is written unchanged.
writeStart=doubleFilter;
writeToStart=mFilterSize>>2;
// swizzle it back.
for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=8) {
__m128 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+2], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1));
__m128 tmp1 = _mm_shuffle_ps(currentSwizzledBlock[j+2], currentSwizzledBlock[j], _MM_SHUFFLE(2,3,2,3));
__m128 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+6], currentSwizzledBlock[j+4], _MM_SHUFFLE(0,1,0,1));
__m128 tmp3 = _mm_shuffle_ps(currentSwizzledBlock[j+6], currentSwizzledBlock[j+4], _MM_SHUFFLE(2,3,2,3));
writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+1], _MM_SHUFFLE(0,1,0,1));
tmp1 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+1], _MM_SHUFFLE(2,3,2,3));
tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+7], currentSwizzledBlock[j+5], _MM_SHUFFLE(0,1,0,1));
tmp3 = _mm_shuffle_ps(currentSwizzledBlock[j+7], currentSwizzledBlock[j+5], _MM_SHUFFLE(2,3,2,3));
writeBlocks[4][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
writeBlocks[5][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
writeBlocks[6][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
writeBlocks[7][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
}
} else {
// swizzle it back. We overlap one block so we only write the first block on the first run
// Only destination buffer 0 is written here; buffers 1-7 receive nothing on the first run.
writeStart=0;
writeToStart=0;
for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=8) {
__m128 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+2], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1));
__m128 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+6], currentSwizzledBlock[j+4], _MM_SHUFFLE(0,1,0,1));
writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
}
}
// Advance every source and destination pointer by one block for the next run.
for(int i=0;i<8;i++) { // shift each block
readBlocks[i]+=mBlockSize>>2; // these are 128b pointers, each window is 1/4 blockSize for those
writeBlocks[i]+=mBlockSize>>2;
}
}
return true;
}
bool EffectEqualization48x::ProcessOne8x(int count, WaveTrack * t,
sampleCount start, sampleCount len)
{
sampleCount blockCount=len/mBlockSize;
if(blockCount<32) // it's not worth 8x processing do a regular process
return ProcessOne4x(count, t, start, len);
sampleCount trackBlockSize = t->GetMaxBlockSize();
AudacityProject *p = GetActiveProject();
WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
mEffectEqualization->TrackProgress(count, 0.0);
int bigRuns=len/(mSubBufferSize-mBlockSize);
int trackBlocksPerBig=mSubBufferSize/trackBlockSize;
int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize;
int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize));
sampleCount currentSample=start;
bool bBreakLoop = false;
for(int bigRun=0;bigRun<bigRuns;bigRun++)
{
// fill the buffer
for(int i=0;i<trackBlocksPerBig;i++) {
t->Get((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize);
currentSample+=trackBlockSize;
}
if(trackLeftovers) {
t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
currentSample+=trackLeftovers;
}
currentSample-=mBlockSize+(mFilterSize>>1);
ProcessBuffer4x(mBufferInfo);
if (bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigRun)/(double)bigRuns))
{
break;
}
output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1)));
}
if(singleProcessLength && !bBreakLoop) {
t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1));
output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
}
output->Flush();
if(!bBreakLoop)
ProcessTail(t, output, start, len);
delete output;
return bBreakLoop;
}
// Equalizes len samples of track t starting at start, handing sub-buffers
// (mSubBufferSize samples each) to the worker threads and appending their
// results, in order, to a temporary track that ProcessTail then consumes.
// Inputs shorter than 256 blocks, or a configuration without worker threads,
// are delegated to ProcessOne4x.
// count is only forwarded to TrackProgress.
// Returns bBreakLoop, i.e. true when TrackProgress asked for the loop to stop.
//
// NOTE(review): this function never sets mProcessingType on the workers, and
// EQWorker::Entry only dispatches on the values 1 and 4 - confirm which
// kernel the workers are expected to run for this path.
bool EffectEqualization48x::ProcessOne8xThreaded(int count, WaveTrack * t,
   sampleCount start, sampleCount len)
{
   sampleCount blockCount=len/mBlockSize;
   // dont do it without cores or big data (this also covers the tiny-input case)
   if(mThreadCount<=0 || blockCount<256)
      return ProcessOne4x(count, t, start, len);

   AudacityProject *p = GetActiveProject();
   WaveTrack *output=p->GetTrackFactory()->NewWaveTrack(floatSample, t->GetRate());
   sampleCount trackBlockSize = t->GetMaxBlockSize();
   mEffectEqualization->TrackProgress(count, 0.0);

   int bigRuns=len/(mSubBufferSize-mBlockSize);
   int trackBlocksPerBig=mSubBufferSize/trackBlockSize;
   int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize;
   int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize));
   sampleCount currentSample=start;
   int bigBlocksRead=mWorkerDataCount, bigBlocksWritten=0;

   // Pre-fill at most bigRuns buffers. Queuing more than that would read
   // beyond the requested range and leave currentSample too far ahead, so the
   // tail segment below would be fetched from the wrong position. This
   // matches the cap used by ProcessOne1x4xThreaded.
   int maxPreFill=bigRuns<mWorkerDataCount?bigRuns:mWorkerDataCount;
   for(int i=0;i<maxPreFill;i++)
   {
      // fill the buffer
      for(int j=0;j<trackBlocksPerBig;j++) {
         t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
         currentSample+=trackBlockSize;
      }
      if(trackLeftovers) {
         t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
         currentSample+=trackLeftovers;
      }
      // step back so consecutive runs overlap
      currentSample-=mBlockSize+(mFilterSize>>1);
      mBufferInfo[i].mBufferStatus=BufferReady; // free for grabbin
   }

   int currentIndex=0;
   bool bBreakLoop = false;
   while(bigBlocksWritten<bigRuns) {
      bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigBlocksWritten)/(double)bigRuns);
      if(bBreakLoop)
         break;

      mDataMutex.Lock(); // Get in line for data
      // process as many blocks as we can, strictly in ring order
      while((mBufferInfo[currentIndex].mBufferStatus==BufferDone) && (bigBlocksWritten<bigRuns)) { // data is ours
         // The first run keeps its leading block; later runs skip the overlapped one.
         output->Append((samplePtr)&mBufferInfo[currentIndex].mBufferDest[0][(bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)));
         bigBlocksWritten++;
         if(bigBlocksRead<bigRuns) {
            // refill this slot with the next stretch of input
            for(int j=0;j<trackBlocksPerBig;j++) {
               t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
               currentSample+=trackBlockSize;
            }
            if(trackLeftovers) {
               t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
               currentSample+=trackLeftovers;
            }
            currentSample-=mBlockSize+(mFilterSize>>1);
            mBufferInfo[currentIndex].mBufferStatus=BufferReady; // free for grabbin
            bigBlocksRead++;
         } else {
            mBufferInfo[currentIndex].mBufferStatus=BufferEmpty; // no more input for this slot
         }
         currentIndex=(currentIndex+1)%mWorkerDataCount;
      }
      mDataMutex.Unlock(); // Get back in line for data
   }

   // Whatever did not fit into a whole run is processed on this thread.
   if(singleProcessLength && !bBreakLoop) {
      t->Get((samplePtr)mBigBuffer, floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
      ProcessBuffer(mBigBuffer, mBigBuffer, singleProcessLength+mBlockSize+(mFilterSize>>1));
      output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
   }

   output->Flush();
   if(!bBreakLoop)
      ProcessTail(t, output, start, len);
   delete output;
   return bBreakLoop;
}
// Filters one window of interleaved data held in __m256 vectors.
// buffer is transformed in place by RealFFTf8x, multiplied bin by bin with
// mFilterFuncR/mFilterFuncI into scratchBuffer, inverse transformed there and
// finally reordered back into buffer by ReorderToTime8x.
// len is the window length; len/2 bins are processed.
void EffectEqualization48x::Filter8x(sampleCount len,
   float *buffer, float *scratchBuffer)
{
   // Apply FFT
   RealFFTf8x(buffer, mEffectEqualization->hFFT);

   __m256 *spectrum=(__m256 *)buffer;          // FFT output, bit-reversed order
   __m256 *filtered=(__m256 *)scratchBuffer;   // filtered bins, natural order
   const int halfLength=(len/2);
   const bool useBitReverseTable=(sMathPath&1)!=0;

   // DC component is purely real
   filtered[0]=_mm256_mul_ps(spectrum[0], _mm256_set1_ps(mEffectEqualization->mFilterFuncR[0]));

   for(int bin=1; bin<halfLength; bin++)
   {
      // Locate the bin either through the precomputed table or by computing
      // the reversed index on the fly.
      int source;
      if(useBitReverseTable)
         source=mEffectEqualization->hFFT->BitReversed[bin];
      else
         source=SmallRB(bin,mEffectEqualization->hFFT->pow2Bits);

      const __m256 real256=spectrum[source];
      const __m256 imag256=spectrum[source+1];
      const __m256 gainR=_mm256_set1_ps(mEffectEqualization->mFilterFuncR[bin]);
      const __m256 gainI=_mm256_set1_ps(mEffectEqualization->mFilterFuncI[bin]);

      // complex multiply: (real + j*imag) * (gainR + j*gainI)
      filtered[2*bin  ] = _mm256_sub_ps( _mm256_mul_ps(real256, gainR), _mm256_mul_ps(imag256, gainI));
      filtered[2*bin+1] = _mm256_add_ps( _mm256_mul_ps(real256, gainI), _mm256_mul_ps(imag256, gainR));
   }

   // Fs/2 component is purely real
   filtered[1] = _mm256_mul_ps(spectrum[1], _mm256_set1_ps(mEffectEqualization->mFilterFuncR[halfLength]));

   // Inverse FFT and normalization
   InverseRealFFTf8x(scratchBuffer, mEffectEqualization->hFFT);
   ReorderToTime8x(mEffectEqualization->hFFT, scratchBuffer, buffer);
}
#endif
#endif