1
0
mirror of https://github.com/cookiengineer/audacity synced 2025-10-10 16:43:33 +02:00

Extensive changes to improve NoteTrack display and (some) editing, NoteTrack playback via MIDI, and Midi-to-Audio alignment.

This commit is contained in:
rbdannenberg
2010-09-18 21:02:36 +00:00
parent f6327602e8
commit a1f0e5ed5b
96 changed files with 5679 additions and 3566 deletions

View File

@@ -15,7 +15,12 @@ are estimated directly from pitch data without synthesis. A similarity matrix
is constructed and dynamic programming finds the lowest-cost path through the
matrix.
(some more details should be added here about handling boundaries)
The alignment can optionally skip the initial silence and final silence
frames in both files. The "best" path matches from the beginning times
(with or without silence) to the end of either sequence but not
necessarily to the end of both. In other words, the match will match
all of the first file to an initial segment of the second, or it will
match all of the second to an initial segment of the first.
Output includes a map from one version to the other. If one file is MIDI,
output also includes (1) an estimated transcript in ASCII format with time,
@@ -32,10 +37,15 @@ For Windows, open score-align.vcproj (probably out of date now -- please
Command line parameters:
scorealign [-<flags> [<period><windowsize><path> <smooth><trans> <midi>]]
scorealign [-<flags> [<period> <windowsize> <path> <smooth>
<trans> <midi> <beatmap> <image>]]
<file1> [<file2>]
specifying only <file1> simply transcribes MIDI in <file1> to
transcription.txt. Otherwise, align <file1> and <file2>.
Flags are all listed together, e.g. -hwrstm, followed by filenames
and arguments corresponding to the flags in the order the flags are
given. Do not try something like "-h 0.1 -w 0.25". Instead, use
"-hw 0.1 0.25". The flags are:
-h 0.25 indicates a frame period of 0.25 seconds
-w 0.25 indicates a window size of 0.25 seconds.
-r indicates filename to write raw alignment path to (default path.data)
@@ -44,6 +54,8 @@ scorealign [-<flags> [<period><windowsize><path> <smooth><trans> <midi>]]
(default is transcription.txt)
-m is filename to write the time aligned midi file (default is midi.mid)
-b is filename to write the time aligned beat times (default is beatmap.txt)
-i is filename to write an image of the distance matrix
(default is distance.pnm)
-o 2.0 indicates a smoothing window of 2.0s
-p 3.0 means pre-smooth with a 3s window
-x 6.0 indicates 6s line segment approximation
@@ -80,9 +92,9 @@ linear regression values. Next, a hill-climbing search is performed to
minimize the total distance along the path. This is like dynamic programming
except that each line spans many frames, so the resulting path is forced to
be fairly straight. Linear interpolation is used to estimate chroma distance
since the lines do always pass through integer frame locations. This approach
is probably good when the audio is known to have a steady tempo or be
performed with tempo changes that match those in the midi file.
since the lines do not always pass through integer frame locations. This
approach is probably good when the audio is known to have a steady tempo or
be performed with tempo changes that match those in the midi file.
Some notes on the software architecture of scorealign:

View File

@@ -0,0 +1,29 @@
/**********************************************************************
Audacity: A Digital Audio Editor
ScoreAlignParams.h
**********************************************************************/
#ifndef __AUDACITY_SCORE_ALIGN_PARAMS__
#define __AUDACITY_SCORE_ALIGN_PARAMS__
// Bundle of settings handed to scorealign() together with the results it
// reports back. The first group is copied into the Scorealign object
// before alignment; the second group is written by scorealign().
// Everything except mStatus is a double, including the two on/off options.
struct ScoreAlignParams {
// copied to Scorealign::frame_period (presumably seconds -- confirm)
double mFramePeriod;
// copied to Scorealign::window_size (presumably seconds -- confirm)
double mWindowSize;
// copied to Scorealign::silence_threshold
double mSilenceThreshold;
// on/off option stored as a double: scorealign() treats any value
// other than 0.0 as true (force_final_alignment)
double mForceFinalAlignment;
// on/off option stored as a double: scorealign() treats any value
// other than 0.0 as true (ignore_silence)
double mIgnoreSilence;
// copied to Scorealign::presmooth_time
double mPresmoothTime;
// copied to Scorealign::line_time
double mLineTime;
// copied to Scorealign::smooth_time
double mSmoothTime;
// information returned from score alignment:
// NOTE(review): the scorealign() glue function does not assign mStatus;
// presumably the caller fills it in -- confirm
int mStatus; // wxID_OK or not?
// first_y * actual_frame_period_1
double mAudioStart;
// (last_y + 1) * actual_frame_period_1
double mAudioEnd;
// first_x * actual_frame_period_0
double mMidiStart;
// (last_x + 1) * actual_frame_period_0
double mMidiEnd;
};
#endif

View File

@@ -6,6 +6,7 @@
#include "stdlib.h"
#include "audioreader.h"
#include "allegro.h"
#include "scorealign.h"
#include "scorealign-glue.h"
#include "audiomixerreader.h"
@@ -26,7 +27,7 @@ Audio_mixer_reader::Audio_mixer_reader(void *mixer_,
index = 0;
channels = chans;
sample_rate = srate;
total_frames = end_time * srate + 0.5 /* for rounding */;
total_frames = (long) (end_time * srate + 0.5 /* for rounding */);
}

View File

@@ -1,64 +1,15 @@
#include <math.h>
#include <fstream>
#include <algorithm>
#include "allegro.h"
#include "audioreader.h"
#include "scorealign.h"
#include "gen_chroma.h"
#include "comp_chroma.h"
using namespace std;
/* NORM_CHROMA
 *
 * Rescale every frame of chrom_energy in place so that its 12 chroma
 * bins have mean 0 and standard deviation 1. Element 12 of each frame
 * (the 13th) is set to 1 when the frame's average energy is below
 * SILENCE_THRESHOLD and to 0 otherwise.
 *
 * len          -- number of frames stored in chrom_energy
 * chrom_energy -- frame data, addressed through AREF2
 */
void norm_chroma( int len, float *chrom_energy ) {
    for (int frame = 0; frame < len; frame++) {
        float *row = &AREF2(chrom_energy, frame, 0);

        /* mean of the 12 chroma bins */
        float total = 0;
        for (int k = 0; k < 12; k++) {
            total += row[k];
        }
        float mean = total / 12.0;

        /* silence flag is decided from the un-normalized mean */
        row[12] = (mean < SILENCE_THRESHOLD) ? 1.0F : 0.0F;

        /* shift to mean 0 and accumulate squared deviations */
        float sq_total = 0;
        for (int k = 0; k < 12; k++) {
            row[k] -= mean;
            float centered = row[k];
            sq_total += centered * centered;
        }

        /* scale to std. dev. 1; a flat frame is left at all zeros */
        float spread = sqrt( sq_total / 12.0 );
        if (spread == 0.0) {
            spread = 1.0F; /* don't divide by zero */
        }
        for (int k = 0; k < 12; k++) {
            row[k] /= spread;
        }
    }
}
/* Returns x when x is strictly less than y, otherwise y */
double min2( double x, double y ) {
    if (x < y) {
        return x;
    }
    return y;
}
#define SILENCE_DISTANCE 16.0
/* GEN_DIST
*
@@ -66,27 +17,23 @@ double min2( double x, double y ) {
* and j in two chroma vectors for use with dynamic time warping of
* the chroma vectors.
*/
float gen_dist( int i, int j, float *chrom_energy1,
float *chrom_energy2 ) {
float sum = 0;
float MAX = 12.0;
if (AREF2(chrom_energy1, i, CHROMA_BIN_COUNT) !=
AREF2(chrom_energy2, j, CHROMA_BIN_COUNT)) {
//printf("gd%g ", SILENCE_DISTANCE); // print result
return SILENCE_DISTANCE;
}
/* Determine the distance between these vectors
chroma1[i] and chroma2[j] to return */
for (int k = 0; k < 12; k++) {
float x = AREF2(chrom_energy1, i, k);
float y = AREF2(chrom_energy2, j, k);
float diff = x - y;
sum += diff*diff ;
}
sum = min2( sqrt( sum ), MAX );
//printf("gd%g ", sum); // print the result
return sum;
float Scorealign::gen_dist(int i, int j)
{
    assert(i < file0_frames);
    assert(j < file1_frames);

    // chroma vectors for frame i of file 0 and frame j of file 1
    const float *frame0 = AREF1(chrom_energy0, i);
    const float *frame1 = AREF1(chrom_energy1, j);

    // the extra element after the chroma bins is the silence flag; when
    // the flags differ the frames are a fixed (large) distance apart
    if (frame0[CHROMA_BIN_COUNT] != frame1[CHROMA_BIN_COUNT]) {
        return SILENCE_DISTANCE;
    }

    // Euclidean distance over the chroma bins
    float sq_total = 0;
    int k = 0;
    while (k < CHROMA_BIN_COUNT) {
        float delta = frame0[k] - frame1[k];
        sq_total += delta * delta;
        k++;
    }

    // distances are capped at this ceiling
    const float ceiling = 12.0;
    return min(sqrt(sq_total), ceiling);
}

View File

@@ -1,24 +1,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>
//#include <stdio.h>
//#include <stdlib.h>
//#include <string.h>
//#include <ctype.h>
//#include <math.h>
#define SILENCE_THRESHOLD 0.001
#define SILENCE_DISTANCE 16.0
/* NORM_CHROMA
*
* This function normalizes the chroma for each frame of the
* chrom_energy to mean 0 and std. dev. 1.
*/
void norm_chroma( int len, float *chrom_energy );
/* GEN_DIST
*
* This function generates the Euclidean distance for points i
* and j in two chroma vectors for use with dynamic time warping of
* the chroma vectors.
*/
float gen_dist(int i, int j, float *chrom_energy1,
float *chrom_energy2 );

View File

@@ -8,6 +8,7 @@
*/
#include "assert.h"
#include <math.h>
#include "comp_chroma.h"
#include "sautils.h"
// the following are needed to get Scorealign
@@ -48,9 +49,15 @@ void save_path(char *filename);
class Curvefit : public Hillclimb {
public:
Curvefit(Scorealign *sa_, bool verbose_) { sa = sa_; verbose = verbose_; }
Curvefit(Scorealign *sa_, bool verbose_) {
sa = sa_;
verbose = verbose_;
p1_cache = p2_cache = d_cache = x = NULL;
}
~Curvefit();
virtual double evaluate();
void setup(int n);
void set_step_size(double ss);
double *get_x() { return x; }
private:
Scorealign *sa;
@@ -101,35 +108,41 @@ void Curvefit::setup(int segments)
// number of parameters is greater than segments because the left
// col of segment i is parameter i, so the right col of
// the last segment == parameter[segments].
n = segments + 1;
parameters = ALLOC(double, n);
Hillclimb::setup(segments + 1);
p1_cache = ALLOC(double, n);
p2_cache = ALLOC(double, n);
d_cache = ALLOC(double, n);
x = ALLOC(double, n);
step_size = ALLOC(double, n);
min_param = ALLOC(double, n);
max_param = ALLOC(double, n);
int i;
// ideal frames per segment
float seg_length = ((float) (sa->file1_frames - 1)) / segments;
float seg_length = ((float) (sa->last_x - sa->first_x)) / segments;
for (i = 0; i < n; i++) { // initialize cache keys to garbage
p1_cache[i] = p2_cache[i] = -999999.99;
// initialize x values
x[i] = ROUND(i * seg_length);
x[i] = ROUND(sa->first_x + i * seg_length);
// now initialize parameters based on pathx/pathy/time_map
// time_map has y values for each x
parameters[i] = sa->time_map[(int) x[i]];
assert(parameters[i] >= 0);
if (verbose)
printf("initial x[%d] = %g, parameters[%d] = %g\n",
i, x[i], i, parameters[i]);
step_size[i] = 0.5;
min_param[i] = 0;
max_param[i] = sa->file2_frames - 1;
max_param[i] = sa->last_y;
}
}
Curvefit::~Curvefit()
{
    // Release the arrays that setup() allocates for this class. The
    // constructor leaves all four NULL, so a Curvefit that was never
    // set up frees nothing.
    double *owned[] = { p1_cache, p2_cache, d_cache, x };
    for (int k = 0; k < 4; k++) {
        if (owned[k]) {
            FREE(owned[k]);
        }
    }
}
// distance_rc -- look up or compute distance between chroma vectors
// at row, col in similarity matrix
//
@@ -142,7 +155,7 @@ void Curvefit::setup(int segments)
// Since distance can be computed relatively quickly, a better plan
// would be to cache values along the path. Here's a brief design
// (for the future, assuming this routine is actually a hot spot):
// Allocate a matrix that is, say, 20 x file1_frames to contain distances
// Allocate a matrix that is, say, 20 x file0_frames to contain distances
// that are +/- 10 frames from the path. Initialize cells to -1.
// Allocate an array of integer offsets of size file1_frames.
// Fill in the integer offsets with the column number (pathy) value of
@@ -157,7 +170,10 @@ void Curvefit::setup(int segments)
//
double Curvefit::distance_rc(int row, int col)
{
return gen_dist(row, col, sa->chrom_energy1, sa->chrom_energy2);
double dist = sa->gen_dist(row, col);
if (dist > 20) // DEBUGGING
printf("internal error");
return dist;
}
@@ -190,6 +206,7 @@ double Curvefit::compute_dist(int i)
double dx = x2 - x1, dy = y2 - y1;
double sum = 0;
int n;
assert(x1 >= 0 && x2 >= 0 && y1 >= 0 && y2 >= 0);
if (dx > dy) { // evauate at each x
n = (int) dx;
for (int x = (int) x1; x < x2; x++) {
@@ -204,14 +221,52 @@ double Curvefit::compute_dist(int i)
}
}
// normalize using line length: sum/n is average distance. Multiply
// avg. distance (cost per unit length) by length to get total cost:
// avg. distance (cost per unit length) by length to get total cost.
// Note: this gives an advantage to direct diagonal paths without bends
// because longer path lengths result in higher total cost. This also
// gives higher weight to longer segments, although all segments are
// about the same length.
double rslt = sqrt(dx*dx + dy*dy) * sum / n;
// printf("compute_dist %d: x1 %g y1 %g x2 %g y2 %g sum %g rslt %g\n",
// i, x1, y1, x2, y2, sum, rslt);
if (rslt < 0 || rslt > 20 * n) { // DEBUGGING
printf("internal error");
}
return rslt;
}
// Assign the same step size ss to every one of the n parameters.
void Curvefit::set_step_size(double ss)
{
    double *slot = step_size;
    int remaining = n;
    while (remaining-- > 0) {
        *slot++ = ss;
    }
}
static long curvefit_iterations;
// This is a callback from Hillclimb::optimize to report progress
// We can't know percentage completion because we don't know how
// many iterations it will take to converge, so we just report
// iterations. The SAProgress class assumes some number based
// on experience.
//
// Normally, the iterations parameter is a good indicator of work
// expended so far, but since we call Hillclimb::optimize twice
// (second time with a finer grid to search), ignore iterations
// and use curvefit_iterations, a global counter, instead. This
// assumes that curvefit_progress is called once for each iteration.
//
void curvefit_progress(void *cookie, int iterations, double best)
{
Scorealign *sa = (Scorealign *) cookie;
if (sa->progress) {
sa->progress->set_smoothing_progress(++curvefit_iterations);
}
}
void curve_fitting(Scorealign *sa, bool verbose)
{
if (verbose)
@@ -220,12 +275,17 @@ void curve_fitting(Scorealign *sa, bool verbose)
Curvefit curvefit(sa, verbose);
double *parameters;
double *x;
curvefit_iterations = 0;
// how many segments? About total time / line_time:
int segments =
(int) (0.5 + (sa->actual_frame_period_1 * sa->file1_frames) /
(int) (0.5 + (sa->actual_frame_period_0 * (sa->last_x - sa->first_x)) /
sa->line_time);
curvefit.setup(segments);
curvefit.optimize();
curvefit.optimize(&curvefit_progress, sa);
// further optimization with smaller step sizes:
// this step size will interpolate 0.25s frames down to 10ms
curvefit.set_step_size(0.04);
curvefit.optimize(&curvefit_progress, sa);
parameters = curvefit.get_parameters();
x = curvefit.get_x();
// now, rewrite pathx and pathy according to segments

View File

@@ -111,7 +111,7 @@ void FFT3(int NumSamples,
int i, j, k, n;
int BlockSize, BlockEnd;
float angle_numerator = 2.0 * M_PI;
float angle_numerator = float(2.0 * M_PI);
float tr, ti; /* temp real, temp imaginary */
if (!IsPowerOfTwo(NumSamples)) {
@@ -224,7 +224,7 @@ void RealFFT3(int NumSamples, float *RealIn, float *RealOut, float *ImagOut)
int Half = NumSamples / 2;
int i;
float theta = M_PI / Half;
float theta = float(M_PI / Half);
float *tmpReal = (float *) alloca(sizeof(float) * Half);
float *tmpImag = (float *) alloca(sizeof(float) * Half);
@@ -289,7 +289,7 @@ void PowerSpectrum3(int NumSamples, float *In, float *Out)
int Half = NumSamples / 2;
int i;
float theta = M_PI / Half;
float theta = float(M_PI / Half);
float *tmpReal = (float *) alloca(sizeof(float) * Half);;
float *tmpImag = (float *) alloca(sizeof(float) * Half);

View File

@@ -30,7 +30,6 @@ using namespace std;
// each row is one chroma vector,
// data is stored as an array of chroma vectors:
// vector 1, vector 2, ...
#define CHROM(row, column) AREF2((*chrom_energy), row, column)
float hz_to_step(float hz)
{
@@ -40,21 +39,19 @@ float hz_to_step(float hz)
/* GEN_MAGNITUDE
given the real and imaginary portions of a complex FFT function, compute
the magnitude of the fft bin.
given input of 2 arrays (inR and inI) of length n, takes the ith element
from each, squares them, sums them, takes the square root of the sum and
puts the output into the ith position in the array out.
NOTE: out should be length n
*/
void gen_Magnitude(float* inR,float* inI, int low, int hi, float* out)
void gen_Magnitude(float* inR, float* inI, int low, int hi, float* out)
{
int i;
for (i = low; i < hi; i++) {
float magVal = sqrt(inR[i] * inR[i] + inI[i] * inI[i]);
//printf(" %d: sqrt(%g^2+%g^2)=%g\n",i,inR[i],inI[i+1],magVal);
out[i]= magVal;
#ifdef SA_VERBOSE
if (i == 1000) printf("gen_Magnitude: %d %g\n", i, magVal);
if (i == 1000) fprintf(dbf, "gen_Magnitude: %d %g\n", i, magVal);
#endif
}
}
@@ -116,17 +113,12 @@ int min_Bin_Num(float* bins, int numBins){
applies the hamming function to each sample.
n specifies the length of in and out.
*/
void gen_Hamming(float* in, int n, float* out)
void gen_Hamming(float* h, int n)
{
int k = 0;
for(k = 0; k < n; k++) {
float internalValue = 2.0 * M_PI * k * (1.0 / (n - 1));
float cosValue = cos(internalValue);
float hammingValue = 0.54F + (-0.46F * cosValue);
#ifdef SA_VERBOSE
if (k == 1000) printf("Hamming %g\n", hammingValue);
#endif
out[k] = hammingValue * in[k];
int k;
for (k = 0; k < n; k++) {
float cos_value = (float) cos(2.0 * M_PI * k * (1.0 / n));
h[k] = 0.54F + (-0.46F * cos_value);
}
}
@@ -142,6 +134,36 @@ int nextPowerOf2(int n)
}
// Rescale one chroma vector (from audio or midi) in place so that its
// CHROMA_BIN_COUNT bins have mean 0 and std. dev. 1. Elements at index
// CHROMA_BIN_COUNT and beyond are not touched.
//
static void normalize(float *cv)
{
    /* mean of the bins */
    float mean = 0;
    for (int k = 0; k < CHROMA_BIN_COUNT; k++) {
        mean += cv[k];
    }
    mean /= CHROMA_BIN_COUNT;

    /* shift to mean 0 and accumulate squared deviations */
    float sq_total = 0;
    for (int k = 0; k < CHROMA_BIN_COUNT; k++) {
        cv[k] -= mean;
        float centered = cv[k];
        sq_total += centered * centered;
    }

    /* std. dev. for this frame; a flat frame is left at all zeros */
    float spread = sqrt(sq_total / CHROMA_BIN_COUNT);
    if (spread == 0.0) {
        spread = 1.0F; /* don't divide by zero */
    }

    /* scale to std. dev. 1 */
    for (int k = 0; k < CHROMA_BIN_COUNT; k++) {
        cv[k] /= spread;
    }
}
/* GEN_CHROMA_AUDIO -- compute chroma for an audio file
*/
/*
@@ -153,8 +175,8 @@ int nextPowerOf2(int n)
(aka the length of the 1st dimension of chrom_energy)
*/
int Scorealign::gen_chroma_audio(Audio_reader &reader, int hcutoff,
int lcutoff, float **chrom_energy, float *actual_frame_period,
int id, bool verbose)
int lcutoff, float **chrom_energy, double *actual_frame_period,
int id)
{
int i;
double sample_rate = reader.get_sample_rate();
@@ -165,9 +187,12 @@ int Scorealign::gen_chroma_audio(Audio_reader &reader, int hcutoff,
printf ("==============FILE %d====================\n", id);
reader.print_info();
}
// this seems like a poor way to set actual_frame_period_1 or _2 in
#if DEBUG_LOG
fprintf(dbf, "******** BEGIN AUDIO CHROMA COMPUTATION *********\n");
#endif
// this seems like a poor way to set actual_frame_period_0 or _1 in
// the Scorealign object, but I'm not sure what would be better:
*actual_frame_period = reader.actual_frame_period;
*actual_frame_period = float(reader.actual_frame_period);
for (i = 0; i < CHROMA_BIN_COUNT; i++) {
reg11[i] = -999;
@@ -230,7 +255,7 @@ int Scorealign::gen_chroma_audio(Audio_reader &reader, int hcutoff,
// sample_rate / full_data_size);
double freq = low_bin * sample_rate / full_data_size;
for (i = low_bin; i < high_bin; i++) {
float raw_bin = hz_to_step(freq);
float raw_bin = hz_to_step(float(freq));
int round_bin = (int) (raw_bin + 0.5F);
int mod_bin = round_bin % 12;
bin_map[i] = mod_bin;
@@ -238,24 +263,35 @@ int Scorealign::gen_chroma_audio(Audio_reader &reader, int hcutoff,
}
// printf("BIN_COUNT is !!!!!!!!!!!!! %d\n",CHROMA_BIN_COUNT);
// create Hamming window data
float *hamming = ALLOC(float, reader.samples_per_frame);
gen_Hamming(hamming, reader.samples_per_frame);
while (reader.read_window(full_data)) {
//fill out array with 0's till next power of 2
#ifdef SA_VERBOSE
printf("samples_per_frame %d sample %g\n", reader.samples_per_frame,
full_data[0]);
fprintf(dbf, "samples_per_frame %d sample %g\n",
reader.samples_per_frame, full_data[0]);
#endif
for (i = reader.samples_per_frame; i < full_data_size; i++)
full_data[i] = 0;
#ifdef AS_VERBOSE
printf("preFFT: full_data[1000] %g\n", full_data[1000]);
#ifdef SA_VERBOSE
fprintf(dbf, "preFFT: full_data[1000] %g\n", full_data[1000]);
#endif
//the data from the wave file, each point mult by a hamming value
gen_Hamming(full_data, full_data_size, full_data);
// compute the RMS, then apply the Hamming window to the data
float rms = 0.0f;
for (i = 0; i < reader.samples_per_frame; i++) {
float x = full_data[i];
rms += x * x;
full_data[i] = x * hamming[i];
}
rms = sqrt(rms / reader.samples_per_frame);
#ifdef SA_VERBOSE
printf("preFFT: hammingData[1000] %g\n", full_data[1000]);
fprintf(dbf, "preFFT: hammingData[1000] %g\n",
full_data[1000]);
#endif
FFT3(full_data_size, 0, full_data, NULL, fft_dataR, fft_dataI); //fft3
@@ -322,19 +358,42 @@ int Scorealign::gen_chroma_audio(Audio_reader &reader, int hcutoff,
//put chrom energy into the returned array
#ifdef SA_VERBOSE
printf("cv_index %d\n", cv_index);
fprintf(dbf, "cv_index %d\n", cv_index);
#endif
assert(cv_index < reader.frame_count);
for (i = 0; i < CHROMA_BIN_COUNT; i++)
CHROM(cv_index, i) = binEnergy[i] / binCount[i];
float *cv = AREF1(*chrom_energy, cv_index);
for (i = 0; i < CHROMA_BIN_COUNT; i++) {
cv[i] = binEnergy[i] / binCount[i];
}
if (rms < silence_threshold) {
// "silence" flag
cv[CHROMA_BIN_COUNT] = 1.0f;
} else {
cv[CHROMA_BIN_COUNT] = 0.0f;
// normalize the non-silent frames
normalize(cv);
}
#if DEBUG_LOG
fprintf(dbf, "%d@%g) ", cv_index, cv_index * reader.actual_frame_period);
for (int i = 0; i < CHROMA_BIN_COUNT; i++) {
fprintf(dbf, "%d:%g ", i, cv[i]);
}
fprintf(dbf, " sil?:%g\n\n", cv[CHROMA_BIN_COUNT]);
#endif
cv_index++;
if (progress && cv_index % 10 == 0 &&
!progress->set_feature_progress(
float(cv_index * reader.actual_frame_period))) {
break;
}
} // end of while ((readcount = read_mono_floats...
free(hamming);
free(fft_dataI);
free(fft_dataR);
free(full_data);
if (verbose)
printf("\nGenerated Chroma. file%d_frames is %i\n", id, file1_frames);
printf("\nGenerated Chroma. file%d_frames is %i\n", id, file0_frames);
return cv_index;
}
@@ -362,7 +421,7 @@ typedef Event_list *Event_list_ptr;
The chroma energy is placed in the float *chrom_energy.
this 2D is an array of pointers.
The function returns the number of frames
(aka the length of the 1st dimention of chrom_energy)
(aka the length of the 1st dimension of chrom_energy)
*
*
Notes: keep a list of notes that are sounding.
@@ -374,25 +433,33 @@ typedef Event_list *Event_list_ptr;
How many frames?
*/
int Scorealign::gen_chroma_midi(Alg_seq &seq, int hcutoff, int lcutoff,
float **chrom_energy, float *actual_frame_period,
int id, bool verbose)
int Scorealign::gen_chroma_midi(Alg_seq &seq, float dur, int nnotes,
int hcutoff, int lcutoff,
float **chrom_energy, double *actual_frame_period,
int id)
{
// silence_threshold is compared to the *average* of chroma bins.
// Rather than divide the sum by CHROMA_BIN_COUNT to compute the
// average, just compute the sum and compare to silence_threshold * 12
float threshold = (float) (silence_threshold * CHROMA_BIN_COUNT);
if (verbose) {
printf ("==============FILE %d====================\n", id);
SA_V(seq.write(cout, true));
}
/*=============================================================*/
#if DEBUG_LOG
fprintf(dbf, "******** BEGIN MIDI CHROMA COMPUTATION *********\n");
#endif /*=============================================================*/
*actual_frame_period = (frame_period) ; // since we don't quantize to samples
*actual_frame_period = frame_period; // since we don't quantize to samples
/*=============================================================*/
seq.convert_to_seconds();
/* find duration */
float dur = 0.0F;
int nnotes = 0;
nnotes= find_midi_duration(seq, &dur);
///* find duration */
//float dur = 0.0F;
//int nnotes = 0;
//nnotes = find_midi_duration(seq, &dur);
/*================================================================*/
@@ -417,13 +484,15 @@ int Scorealign::gen_chroma_midi(Alg_seq &seq, int hcutoff, int lcutoff,
/*====================================================*/
float frame_begin = max((cv_index * (frame_period)) -
window_size/2 , 0.0F); //chooses zero if negative
float frame_begin = (float) max(cv_index * frame_period -
window_size / 2.0, 0.0);
//chooses zero if negative
float frame_end= frame_begin +(window_size/2);
float frame_end = (float) (cv_index * frame_period + window_size / 2.0);
/*============================================================*/
float *cv = AREF1(*chrom_energy, cv_index);
/* zero the vector */
for (int i = 0; i < CHROMA_BIN_COUNT; i++) CHROM(cv_index, i) = 0;
for (int i = 0; i < CHROMA_BIN_COUNT + 1; i++) cv[i] = 0;
/* add new notes that are in the frame */
while (event && event->time < frame_end) {
if (event->is_note()) {
@@ -442,6 +511,7 @@ int Scorealign::gen_chroma_midi(Alg_seq &seq, int hcutoff, int lcutoff,
}
if (*ptr) ptr = &((*ptr)->next);
}
float sum = 0.0;
for (Event_list_ptr item = list; item; item = item->next) {
/* compute duration of overlap */
float overlap =
@@ -450,18 +520,34 @@ int Scorealign::gen_chroma_midi(Alg_seq &seq, int hcutoff, int lcutoff,
float velocity = item->note->loud;
float weight = overlap * velocity;
#if DEBUG_LOG
fprintf(dbf, "%3d pitch %g key %d overlap %g velocity %g\n",
cv_index, item->note->pitch, item->note->get_identifier(),
overlap, velocity);
fprintf(dbf, "%3d pitch %g starting %g key %d overlap %g velocity %g\n",
cv_index, item->note->pitch, item->note->time,
item->note->get_identifier(), overlap, velocity);
#endif
CHROM(cv_index, (int)item->note->pitch % 12) += weight;
cv[(int) item->note->pitch % 12] += weight;
sum += weight;
}
if (sum < threshold) {
cv[CHROMA_BIN_COUNT] = 1.0;
} else {
normalize(cv);
}
#if DEBUG_LOG
fprintf(dbf, "%d@%g) ", cv_index, frame_begin);
for (int i = 0; i < CHROMA_BIN_COUNT; i++) {
fprintf(dbf, "%d:%g ", i, CHROM(cv_index, i));
fprintf(dbf, "%d:%g ", i, cv[i]);
}
fprintf(dbf, "\n\n");
fprintf(dbf, " sil?:%g\n\n", cv[CHROMA_BIN_COUNT]);
#endif
if (cv_index % 10 == 0 && progress &&
!progress->set_feature_progress(
float(cv_index * *actual_frame_period))) {
break;
}
}
while (list) {
Event_list_ptr temp = list;
@@ -470,6 +556,6 @@ int Scorealign::gen_chroma_midi(Alg_seq &seq, int hcutoff, int lcutoff,
}
iterator.end();
if (verbose)
printf("\nGenerated Chroma. file%d_frames is %i\n", id, file1_frames);
printf("\nGenerated Chroma. file%d_frames is %i\n", id, file0_frames);
return frame_count;
}

View File

@@ -2,5 +2,10 @@
bool is_midi_file(char *filename);
#define AREF2(chrom_energy, row, column) \
(chrom_energy[row * (CHROMA_BIN_COUNT + 1) + column])
// index into matrix to extract chroma vector
#define AREF1(chrom_energy, row) \
((chrom_energy) + (row) * (CHROMA_BIN_COUNT + 1))
// index into matrix to extract element of chroma vector
#define AREF2(chrom_energy, row, column) AREF1(chrom_energy, row)[column]

View File

@@ -26,12 +26,33 @@
* maximum.
*/
#include "hillclimb.h"
#include "stdio.h"
#include "malloc.h"
#include "sautils.h"
#include "hillclimb.h"
#define HC_VERBOSE 0
#define V if (HC_VERBOSE)
// Free the four per-parameter arrays; any that are still NULL are skipped.
Hillclimb::~Hillclimb()
{
    double *owned[] = { parameters, step_size, min_param, max_param };
    const int count = (int) (sizeof(owned) / sizeof(owned[0]));
    for (int k = 0; k < count; k++) {
        if (owned[k]) {
            FREE(owned[k]);
        }
    }
}
void Hillclimb::setup(int n_) {
n = n_;
parameters = ALLOC(double, n);
step_size = ALLOC(double, n);
min_param = ALLOC(double, n);
max_param = ALLOC(double, n);
}
void Hillclimb::set_parameters(double *p, double *ss,
double *min_, double *max_, int plen)
{
@@ -108,17 +129,20 @@ double Hillclimb::optimize()
}
*/
double Hillclimb::optimize()
double Hillclimb::optimize(Report_fn_ptr report, void *cookie)
{
double best = evaluate();
int iterations = 0;
while (true) {
(*report)(cookie, iterations, best);
V printf("best %g ", best);
// eval partial derivatives
int i;
// variables to search for max partial derivative
double max_y = best; // max of evaluate() so far
int max_i; // index where best max was found
double max_parameter; // the good parameter value for max_i
int max_i = 0; // index where best max was found
// the good parameter value for max_i:
double max_parameter = parameters[0];
// now search over all parameters for best improvement
for (i = 0; i < n; i++) {
V printf("optimize at %d param %g ", i, parameters[i]);
@@ -148,8 +172,10 @@ double Hillclimb::optimize()
parameters[i] = save_param;
V printf("\n");
}
iterations++; // for debugging, reporting
if (max_y <= best) { // no improvement, we're done
V printf("\nCompleted hillclimbing, best %g\n", best);
(*report)(cookie, iterations, best);
return best;
}
// improvement because max_y higher than best:

View File

@@ -15,6 +15,9 @@
*
*/
// while optimizing, this function is called to report progress
typedef void (*Report_fn_ptr)(void *cookie, int iteration, double best);
class Hillclimb {
protected:
double *parameters; // parameters to optimize
@@ -24,12 +27,17 @@ protected:
double *max_param; // maximum parameter values
int n; // number of parameters
public:
Hillclimb() {
parameters = step_size = min_param = max_param = NULL;
}
void setup(int n_);
~Hillclimb();
void set_parameters(double *parameters_, double *step_size_,
double *min_, double *max_, int n_);
// retrieve parameters after optimization:
double *get_parameters() { return parameters; }
virtual double evaluate() = 0;
double optimize();
double optimize(Report_fn_ptr report, void *cookie);
};

View File

@@ -8,6 +8,7 @@
*/
// ALLOC(t, n): malloc() room for n objects of type t and cast the result
// to t *. The memory is uninitialized and the result is not checked here,
// so it may be NULL. The expansion has no outer parentheses.
#define ALLOC(t, n) (t *) malloc(sizeof(t) * (n))
// FREE(p): hand p back with free().
#define FREE(p) free(p)
// ROUND(x): add 0.5 and convert to int. The conversion truncates toward
// zero, so this rounds to nearest only for x >= -0.5.
#define ROUND(x) ((int) (0.5 + (x)))

View File

@@ -4,21 +4,41 @@
*/
#include "allegro.h"
#include "scorealign-glue.h"
#include "audioreader.h"
#include "audiomixerreader.h"
#include "scorealign.h"
#include "scorealign-glue.h"
#include "audiomixerreader.h"
void scorealign(void *mixer, mixer_process_fn fn_ptr, int chans, double srate,
double end_time, Alg_seq *seq)
int scorealign(void *mixer, mixer_process_fn fn_ptr, int chans, double srate,
double end_time, Alg_seq *seq, SAProgress *progress,
ScoreAlignParams &params)
{
Scorealign sa;
sa.frame_period = 0.2;
sa.window_size = 0.2;
sa.frame_period = params.mFramePeriod;
sa.window_size = params.mWindowSize;
sa.silence_threshold = params.mSilenceThreshold;
sa.force_final_alignment = (params.mForceFinalAlignment != 0.0);
sa.ignore_silence = (params.mIgnoreSilence != 0.0);
sa.presmooth_time = params.mPresmoothTime;
sa.line_time = params.mLineTime;
sa.smooth_time = params.mSmoothTime;
Audio_mixer_reader reader(mixer, fn_ptr, chans, srate, end_time);
reader.calculate_parameters(sa, false);
sa.align_midi_to_audio(*seq, reader, true);
sa.midi_tempo_align(*seq, false);
sa.progress = progress;
int result = sa.align_midi_to_audio(*seq, reader);
params.mMidiStart = sa.first_x * sa.actual_frame_period_0;
params.mMidiEnd = (sa.last_x + 1) * sa.actual_frame_period_0;
params.mAudioStart = sa.first_y * sa.actual_frame_period_1;
params.mAudioEnd = (sa.last_y + 1) * sa.actual_frame_period_1;
if (result != SA_SUCCESS) {
return result;
}
sa.midi_tempo_align(*seq);
// seq has now been modified to conform to audio provided by mixer
seq->set_real_dur(end_time);
return SA_SUCCESS; // success
}

View File

@@ -1,5 +1,8 @@
typedef long (*mixer_process_fn)(void *mix, float **buffer, long n);
void scorealign(void *mixer, mixer_process_fn fn_ptr,
#include "ScoreAlignParams.h"
int scorealign(void *mixer, mixer_process_fn fn_ptr,
int chans, double srate,
double end_time, Alg_seq *seq);
double end_time, Alg_seq *seq, SAProgress *progress,
ScoreAlignParams &params);

View File

@@ -24,7 +24,7 @@
#define LOW_CUTOFF 40
#define HIGH_CUTOFF 2000
// Note: There are "verbose" flags passed as parameters that
// Note: There is a "verbose" flag in Score_align objects that
// enable some printing. The SA_VERBOSE compiler flag causes a
// lot more debugging output, so it could be called VERY_VERBOSE
// as opposed to the quieter verbose flags.
@@ -36,10 +36,10 @@
// for presmoothing, how near does a point have to be to be "on the line"
#define NEAR 1.5
// path is file1_frames by file2_frames array, so first index
// (rows) is in [0 .. file1_frames]. Array is sequence of rows.
// columns (j) ranges from [0 .. file2_frames]
#define PATH(i,j) (path[(i) * file2_frames + (j)])
// path is file0_frames by file1_frames array, so first index
// (rows) is in [0 .. file0_frames]. Array is sequence of rows.
// columns (j) ranges from [0 .. file1_frames]
#define PATH(i,j) (path[(i) * file1_frames + (j)])
/*===========================================================================*/
@@ -48,21 +48,52 @@ FILE *dbf = NULL;
#endif
// Build a Scorealign with every tunable option at its SA_DFT_* default,
// an empty alignment path, printing disabled and no progress reporter.
// When DEBUG_LOG is enabled, also open debug-log.txt for writing.
Scorealign::Scorealign() {
    // no alignment path yet
    pathx = NULL;
    pathy = NULL;
    pathlen = 0;
    path_count = 0;

    // analysis frame settings
    frame_period = SA_DFT_FRAME_PERIOD;
    window_size = SA_DFT_WINDOW_SIZE;

    // silence handling
    silence_threshold = SA_DFT_SILENCE_THRESHOLD;
    ignore_silence = SA_DFT_IGNORE_SILENCE;
    force_final_alignment = SA_DFT_FORCE_FINAL_ALIGNMENT;

    // path smoothing
    presmooth_time = SA_DFT_PRESMOOTH_TIME;
    line_time = SA_DFT_LINE_TIME;
    smooth_time = SA_DFT_SMOOTH_TIME;

    // reporting
    progress = NULL;
    verbose = false;
#if DEBUG_LOG
    dbf = fopen("debug-log.txt", "w");
    assert(dbf);
#endif
}
// Release the path arrays owned by this object and, in DEBUG_LOG builds,
// close the shared debug log.
// NOTE(review): time_map and smooth_time_map are allocated in
// compare_chroma() but are not released here -- confirm who owns them.
Scorealign::~Scorealign() {
    if (pathx) free(pathx);
    if (pathy) free(pathy);
#if DEBUG_LOG
    // dbf may be NULL if fopen() failed in a build where assert() is
    // compiled out; passing NULL to fclose() is not allowed. Reset the
    // handle so a second Scorealign object does not close it again.
    if (dbf) {
        fclose(dbf);
        dbf = NULL;
    }
#endif
}
// NOTE(review): this span is a rendered diff hunk. Lines that use the
// file1/file2 (actual_frame_period_1/_2) naming appear to be the pre-change
// version and lines that use file0/file1 (actual_frame_period_0/_1) the
// post-change version; both are kept verbatim below.
//
// Purpose: convert a time t1 (seconds) in the first sequence into a frame
// position, clamp the integer frame index so that index+1 is still inside
// the map, and linearly interpolate smooth_time_map between the two
// neighbouring frames. The interpolated frame value is scaled by the
// second sequence's frame period to give seconds.
//
// NOTE(review): smooth_time_map entries outside [first_x, last_x] are set
// to NOT_MAPPED by compute_smooth_time_map(), so interpolating there gives
// a large negative result; midi_tempo_align() only uses results > 0.
// NOTE(review): with fewer than 2 frames the upper clamp yields index -1
// -- confirm callers never reach this with such short input.
/* MAP_TIME
lookup time of file1 in smooth_time_map and interpolate
to get time in file2
lookup time of file0 in smooth_time_map and interpolate
to get time in file1
*/
float Scorealign::map_time(float t1)
{
t1 /= actual_frame_period_1; // convert from seconds to frames
t1 /= (float) actual_frame_period_0; // convert from seconds to frames
int i = (int) t1; // round down
if (i < 0) i = 0;
if (i >= file1_frames - 1) i = file1_frames - 2;
if (i >= file0_frames - 1) i = file0_frames - 2;
// interpolate to get time
return actual_frame_period_2 *
return float(actual_frame_period_1 *
interpolate(i, smooth_time_map[i], i+1, smooth_time_map[i+1],
t1);
t1));
}
@@ -86,7 +117,7 @@ int find_midi_duration(Alg_seq &seq, float *dur)
Alg_event_ptr e = notes[i];
if (e->is_note()) {
Alg_note_ptr n = (Alg_note_ptr) e;
float note_end = n->time + n->dur;
float note_end = float(n->time + n->dur);
if (note_end > *dur) *dur = note_end;
nnotes++;
}
@@ -127,9 +158,9 @@ void Scorealign::path_step(int i, int j)
{
#if DEBUG_LOG
fprintf(dbf, "(%i,%i) ", i, j);
if (++path_count % 5 == 0 ||
(i == 0 && j == 0))
fprintf(dbf, "\n");
if (++path_count % 5 == 0 ||
(i == first_x && j == first_y))
fprintf(dbf, "\n");
#endif
pathx[pathlen] = i;
pathy[pathlen] = j;
@@ -169,8 +200,8 @@ returns the first index in pathy where the element is bigger than sec
*/
int Scorealign::sec_to_pathy_index(float sec)
{
for (int i = 0 ; i < (file1_frames + file2_frames); i++) {
if (smooth_time_map[i] * actual_frame_period_2 >= sec) {
for (int i = 0 ; i < (file0_frames + file1_frames); i++) {
if (smooth_time_map[i] * actual_frame_period_1 >= sec) {
return i;
}
//printf("%i\n" ,pathy[i]);
@@ -184,17 +215,21 @@ given a chrom_energy vector, sees how many
of the initial frames are designated as silent
*/
// NOTE(review): rendered diff hunk -- two versions of this function are
// interleaved below and kept verbatim.
//
// Purpose: count how many frames at the start of chrom_energy are reported
// silent by silent(). The while(silence) form stops only at the first
// non-silent frame and never consults frame_count; the for-loop form stops
// at frame_count as well, so it returns frame_count when every frame is
// silent.
int frames_of_init_silence( float *chrom_energy, int frame_count)
int frames_of_init_silence(float *chrom_energy, int frame_count)
{
bool silence = true;
int frames=0;
while (silence) {
if (silent(frames, chrom_energy))
frames++;
else
silence=false;
int frames;
// scan forward; leave the loop at the first frame that is not silent
for (frames = 0; frames < frame_count; frames++) {
if (!silent(frames, chrom_energy)) break;
}
return frames;
}
// Scan backwards from the final frame and return the index of the last
// frame that silent() does not report as silent. Frame 0 is never tested:
// the scan stops there, so 0 is returned when frames 1..frame_count-1 are
// all silent. For frame_count <= 1 the result is simply frame_count - 1.
int last_non_silent_frame(float *chrom_energy, int frame_count)
{
    int idx = frame_count - 1;
    while (idx > 0 && silent(idx, chrom_energy)) {
        idx--;
    }
    return idx;
}
@@ -202,95 +237,130 @@ int frames_of_init_silence( float *chrom_energy, int frame_count)
// NOTE(review): this span is a rendered diff hunk. Lines using the
// file1/file2, chrom_energy1/chrom_energy2, (bool verbose) and x > 0 / y > 0
// forms appear to be the pre-change version; lines using file0/file1,
// first_x/first_y/last_x/last_y and returning an int status appear to be
// the post-change version. All lines are kept verbatim; review notes are
// attached to the post-change lines.
//
// Purpose (post-change version): fill the cumulative-cost matrix PATH over
// the rectangle [first_x..last_x] x [first_y..last_y], choose an end point
// (forced to (last_x, last_y), or the cheapest cell on the last row/column
// when force_final_alignment is false), then walk back to (first_x, first_y)
// recording the path with path_step() and reversing it.
// Returns SA_SUCCESS, SA_TOOSHORT when either trimmed range has no more
// than one frame, or SA_CANCEL when the progress callback returns false.
/* COMPARE_CHROMA
Perform Dynamic Programming to find optimal alignment
*/
void Scorealign::compare_chroma(bool verbose)
int Scorealign::compare_chroma()
{
float *path;
int x = 0;
int y = 0;
/* Allocate the distance matrix */
path = (float *) calloc(file1_frames * file2_frames, sizeof(float));
path = (float *) calloc(file0_frames * file1_frames, sizeof(float));
// NOTE(review): the calloc() result is not checked before PATH() writes
// into it, and the int product can overflow for very long inputs.
/* Initialize first row and column */
/* skip over initial silence in signals */
if (ignore_silence) {
first_x = frames_of_init_silence(chrom_energy0, file0_frames);
last_x = last_non_silent_frame(chrom_energy0, file0_frames);
first_y = frames_of_init_silence(chrom_energy1, file1_frames);
last_y = last_non_silent_frame(chrom_energy1, file1_frames);
} else {
first_x = 0;
last_x = file0_frames - 1;
first_y = 0;
last_y = file1_frames - 1;
}
/* allow free skip over initial silence in either signal, but not both */
/* silence is indicated by a run of zeros along the first row and or
* column, starting at the origin (0,0). After computing these runs, we
* put the proper value at (0,0)
*/
if (verbose) printf("Performing silent skip DP \n");
PATH(0, 0) = (silent(0, chrom_energy1) ? 0 :
gen_dist(0, 0, chrom_energy1, chrom_energy2));
for (int i = 1; i < file1_frames; i++)
PATH(i, 0) = (PATH(i-1, 0) == 0 && silent(i, chrom_energy1) ? 0 :
gen_dist(i, 0, chrom_energy1, chrom_energy2) +
PATH(i-1, 0));
PATH(0, 0) = (silent(0, chrom_energy2) ? 0 :
gen_dist(0, 0, chrom_energy1, chrom_energy2));
for (int j = 1; j < file2_frames; j++)
PATH(0, j) = (PATH(0, j-1) == 0 && silent(j, chrom_energy2) ? 0 :
gen_dist(0, j, chrom_energy1, chrom_energy2) +
PATH(0, j-1));
/* first row and first column are done, put proper value at (0,0) */
PATH(0, 0) = (!silent(0, chrom_energy1) || !silent(0, chrom_energy2) ?
gen_dist(0, 0, chrom_energy1, chrom_energy2) : 0);
// NOTE(review): this early return leaves the matrix allocated above
// un-freed; free(path) is only reached at the end of the function.
if (last_x - first_x <= 0 || last_y - first_y <= 0) {
return SA_TOOSHORT;
}
/* Initialize first row and column */
if (verbose) printf("Performing DP\n");
PATH(first_x, first_y) = gen_dist(first_x, first_y);
for (int x = first_x + 1; x <= last_x; x++)
PATH(x, first_y) = gen_dist(x, first_y) + PATH(x - 1, first_y);
// NOTE(review): this loop starts at y = 1 whereas the x loop above starts
// at first_x + 1. When first_y > 0 it also visits y <= first_y and so
// overwrites PATH(first_x, first_y) with an accumulated value after the
// loop above has already used the original one. Probably meant
// y = first_y + 1 -- confirm.
for (int y = 1; y <= last_y; y++)
PATH(first_x, y) = gen_dist(first_x, y) + PATH(first_x, y - 1);
#if DEBUG_LOG
fprintf(dbf, "DISTANCE MATRIX ***************************\n");
#endif
/* Perform DP for the rest of the matrix */
for (int i = 1; i < file1_frames; i++)
for (int j = 1; j < file2_frames; j++)
PATH(i, j) = gen_dist(i, j, chrom_energy1, chrom_energy2) +
min3(PATH(i-1, j-1), PATH(i-1, j), PATH(i, j-1));
for (int x = first_x + 1; x <= last_x; x++) {
for (int y = first_y + 1; y <= last_y; y++) {
PATH(x, y) = gen_dist(x, y) +
float(min3(PATH(x-1, y-1), PATH(x-1, y), PATH(x, y-1)));
#if DEBUG_LOG
// NOTE(review): the format has three conversions but four arguments are
// passed, so PATH(x, y) is never printed.
fprintf(dbf, "(%d %d %g) ", x, y, gen_dist(x, y), PATH(x, y));
#endif
}
#if DEBUG_LOG
fprintf(dbf, "\n");
#endif
// report progress for each file0_frame (column)
// This is not quite right if we are ignoring silence because
// then only a sub-matrix is computed.
// NOTE(review): returning SA_CANCEL here also leaves `path` un-freed.
if (progress && !progress->set_matrix_progress(file1_frames))
return SA_CANCEL;
}
#if DEBUG_LOG
fprintf(dbf, "END OF DISTANCE MATRIX ********************\n");
#endif
if (verbose) printf("Completed Dynamic Programming.\n");
x = file1_frames - 1;
y = file2_frames - 1;
//x and y are the ending points, it can end at either the end of midi,
// or end of audio but not both
pathx = ALLOC(short, (x + y + 2));
pathy = ALLOC(short, (x + y + 2));
// or end of audio or both
// NOTE(review): frame indices are stored as short, so inputs with more
// frames than a short can hold would wrap. Nothing in this function frees
// a previous pathx/pathy/time_map/smooth_time_map before re-allocating --
// confirm it is only called once per object.
pathx = ALLOC(short, (file0_frames + file1_frames));
pathy = ALLOC(short, (file0_frames + file1_frames));
assert(pathx != NULL);
assert(pathy != NULL);
// map from file1 time to file2 time
time_map = ALLOC(float, file1_frames);
smooth_time_map = ALLOC(float, file1_frames);
// map from file0 time to file1 time
time_map = ALLOC(float, file0_frames);
smooth_time_map = ALLOC(float, file0_frames);
int x = last_x;
int y = last_y;
if (!force_final_alignment) {
#if DEBUG_LOG
fprintf(dbf, "\nOptimal Path: ");
fprintf(dbf, "\nOptimal Path: ");
#endif
while (1) {
/* Check for stopping */
if (x == 0 & y == 0) {
path_step(0, 0);
path_reverse();
break;
// find end point, the lowest cost matrix value at one of the
// sequence endings
float min_cost = 1.0E10;
for (int i = first_x; i <= last_x; i++) {
if (PATH(i, last_y) <= min_cost) {
min_cost = PATH(i, last_y);
x = i;
y = last_y;
}
}
/* Print the current coordinate in the path*/
for (int j = first_y; j <= last_y; j++) {
if (PATH(last_x, j) <= min_cost) {
min_cost = PATH(last_x, j);
x = last_x;
y = j;
}
}
#if DEBUG_LOG
fprintf(dbf, "Min cost at %d %d\n\nPATH:\n", x, y);
#endif
}
// walk back from the chosen end point until the start cell is reached
while ((x != first_x) || (y != first_y)) {
path_step(x, y);
/* Check for the optimal path backwards*/
if (x > 0 && y > 0 && PATH(x-1, y-1) <= PATH(x-1, y) &&
if (x > first_x && y > first_y && PATH(x-1, y-1) <= PATH(x-1, y) &&
PATH(x-1, y-1) <= PATH(x, y-1)) {
x--;
y--;
} else if (x > 0 && y > 0 && PATH(x-1, y) <= PATH(x, y-1)) {
} else if (x > first_x && y > first_y && PATH(x-1, y) <= PATH(x, y-1)) {
x--;
} else if (y > 0) {
} else if (y > first_y) {
y--;
} else if (x > 0) {
} else if (x > first_x) {
x--;
}
}
// record the start cell itself, then put the path in forward order
path_step(x, y);
path_reverse();
free(path);
return SA_SUCCESS; // success
}
void Scorealign::linear_regression(int n, int width, float &a, float &b)
{
int hw = (width - 1) / 2; // a more convenient form: 1/2 width
@@ -316,32 +386,36 @@ void Scorealign::linear_regression(int n, int width, float &a, float &b)
}
/* COMPUTE_SMOOTH_TIME_MAP
compute regression line and estimate point at i
Number of points in regression is smooth (an odd number). First
index to compute is (smooth-1)/2. Use that line for the first
(smooth+1)/2 points. The last index to compute is
(file1_frames - (smooth+1)/2). Use that line for the last
(file0_frames - (smooth+1)/2). Use that line for the last
(smooth+1)/2 points.
*/
void Scorealign::compute_smooth_time_map()
{
int i;
int hw = (smooth - 1) / 2; // half width of smoothing window
// find the first point
for (i = 0; i < first_x; i++) {
smooth_time_map[i] = NOT_MAPPED;
}
// do the first points:
float a, b;
linear_regression((smooth - 1) / 2, smooth, a, b);
int i;
for (i = 0; i < (smooth + 1) / 2; i++) {
smooth_time_map[i] = a + b*i;
linear_regression(first_x + hw, smooth, a, b);
for (i = first_x; i <= first_x + hw; i++) {
smooth_time_map[i] = a + b * i;
}
// do the middle points:
for (i = (smooth + 1) / 2; i < file1_frames - (smooth + 1) / 2; i++) {
for (i = first_x + hw + 1; i < last_x - hw; i++) {
linear_regression(i, smooth, a, b);
smooth_time_map[i] = a + b*i;
smooth_time_map[i] = a + b * i;
#if DEBUG_LOG
fprintf(dbf, "time_map[%d] = %g, smooth_time_map[%d] = %g\n",
@@ -349,14 +423,15 @@ void Scorealign::compute_smooth_time_map()
#endif
}
// do the last points
linear_regression(file1_frames - (smooth + 1) / 2, smooth, a, b);
for (i = file1_frames - (smooth + 1) / 2; i < file1_frames; i++) {
smooth_time_map[i] = a + b*i;
linear_regression(last_x - hw, smooth, a, b);
for (i = last_x - hw; i <= last_x; i++) {
smooth_time_map[i] = a + b * i;
}
// finally, fill with NOT_MAPPED
for (i = last_x + 1; i < file0_frames; i++)
smooth_time_map[i] = NOT_MAPPED;
}
@@ -401,16 +476,17 @@ short *path_copy(short *path, int len)
*/
void Scorealign::presmooth()
{
int n = ROUND(presmooth_time / actual_frame_period_2);
int n = ROUND(presmooth_time / actual_frame_period_1);
n = (n + 3) & ~3; // round up to multiple of 4
int i = 0;
while (pathx[i] + n < file2_frames) {
while (i < pathlen - 1 && pathx[i] + n <= last_x) {
/* line goes from i to i+n-1 */
int x1 = pathx[i];
int xmid = x1 + n/2;
int x2 = x1 + n;
int y1 = pathy[i];
int y2;
int y2 = pathy[i + 1]; // make sure it has a value. y2 should be
// set in the loop below.
int j;
/* search for y2 = pathy[j] s.t. pathx[j] == x2 */
for (j = i + n; j < pathlen; j++) {
@@ -424,7 +500,8 @@ void Scorealign::presmooth()
int k = i;
int count = 0;
while (pathx[k] < xmid) { // search first half
if (near_line(x1, y1, x2, y2, pathx[k], pathy[k])) {
if (near_line(float(x1), float(y1), float(x2), float(y2),
pathx[k], pathy[k])) {
count++;
regr.point(pathx[k], pathy[k]);
}
@@ -437,7 +514,8 @@ void Scorealign::presmooth()
}
/* see if line fits top half of the data */
while (pathx[k] < x2) {
if (near_line(x1, y1, x2, y2, pathx[k], pathy[k])) {
if (near_line(float(x1), float(y1), float(x2), float(y2),
pathx[k], pathy[k])) {
count++;
regr.point(pathx[k], pathy[k]);
}
@@ -511,11 +589,6 @@ void Scorealign::presmooth()
// make sure new path is no longer than original path
// the last point we wrote was k - 1
k = k - 1; // the last point we wrote is now k
// DEBUG
if (k > j) {
printf("oops: k %d, j %d\n", k, j);
SA_V(print_path_range(pathx, pathy, i, k);)
}
assert(k <= j);
// if new path is shorter than original, then fix up path
if (k < j) {
@@ -539,19 +612,28 @@ void Scorealign::presmooth()
*/
void Scorealign::compute_regression_lines()
{
// first, compute the y value of the path at
int i;
// fill in time_map with NOT_MAPPED until the first point
// of the path
for (i = 0; i < pathx[0]; i++) {
time_map[i] = NOT_MAPPED;
}
// now, compute the y value of the path at
// each x value. If the path has multiple values
// on x, take the average.
int p = 0;
int i;
int upper, lower;
for (i = 0; i < file1_frames; i++) {
for (i = pathx[0]; p < pathlen; i++) {
lower = pathy[p];
while (p < pathlen && pathx[p] == i) {
upper = pathy[p];
p = p + 1;
}
time_map[i] = (lower + upper) * 0.5;
time_map[i] = (lower + upper) * 0.5F;
}
// fill in rest of time_map with NOT_MAPPED
for (i = pathx[pathlen - 1] + 1; i <= last_x; i++) {
time_map[i] = NOT_MAPPED;
}
// now fit a line to the nearest WINDOW points and record the
// line's y value for each x.
@@ -559,115 +641,196 @@ void Scorealign::compute_regression_lines()
}
// NOTE(review): rendered diff hunk -- the (Alg_seq &seq, bool verbose)
// signature and the find_midi_duration()/time_to_beat() lines appear to be
// the pre-change version; the (Alg_seq &seq) signature and the
// get_beat_dur()/get_real_dur() lines the post-change version. All lines
// are kept verbatim.
//
// Purpose: build a new time map for seq by passing the time of every
// integer beat through map_time(), install that map on seq, optionally
// insert one extra map point before the first mapped beat so the opening
// tempo matches the tempo at the first mapped beat, and restore the
// sequence duration (in beats).
//
// NOTE(review): the debug output below is guarded with #ifdef DEBUG_LOG,
// while the rest of this code uses #if DEBUG_LOG and the header defines
// DEBUG_LOG as 0. #ifdef is true for a macro defined as 0, so these
// fprintf(dbf, ...) calls are compiled even when logging is off --
// they should most likely be #if DEBUG_LOG.
void Scorealign::midi_tempo_align(Alg_seq &seq, bool verbose)
void Scorealign::midi_tempo_align(Alg_seq &seq)
{
// We create a new time map out of the alignment, and replace
// the original time map in the Alg_seq sequence
Alg_seq new_time_map_seq;
/** align at all integer beats **/
int totalbeats;
float dur_in_sec;
// probably alignment should respect the real_dur encoded into the seq
// rather than computing real_dur based on note off times -- the
// caller should be required to set real_dur to a good value, and
// the find_midi_duration() function should be available to the caller
// if necessary -RBD
find_midi_duration(seq, &dur_in_sec);
//
// totalbeat = lastbeat + 1 and round up the beat
totalbeats = (int) (seq.get_time_map()->time_to_beat(dur_in_sec) + 2);
if (verbose)
// totalbeats = lastbeat + 1 and round up the beat
int totalbeats = (int) seq.get_beat_dur() + 2;
if (verbose) {
double dur_in_sec = seq.get_real_dur();
printf("midi duration = %f, totalbeats=%i \n", dur_in_sec, totalbeats);
}
#ifdef DEBUG_LOG
fprintf(dbf, "***************** CONSTRUCTING TIME MAP ***************\n");
#endif
// turn off last tempo flag so last tempo will extrapolate
new_time_map_seq.get_time_map()->last_tempo_flag = false;
int first_beat = -1;
for (int i = 0; i < totalbeats; i++) {
double newtime = map_time(seq.get_time_map()->beat_to_time(i));
if (newtime > 0)
double newtime = map_time(float(seq.get_time_map()->beat_to_time(i)));
// only beats that map to a positive time are entered in the new map
if (newtime > 0) {
new_time_map_seq.insert_beat(newtime, (double) i);
// remember where the new time map begins
if (first_beat < 0) first_beat = i;
#ifdef DEBUG_LOG
fprintf(dbf, "map beat %d to time %g\n", i, newtime);
#endif
}
}
seq.convert_to_beats();
seq.set_time_map(new_time_map_seq.get_time_map());
// save the duration (after convert_to_beats) so it can be restored below
double end_beat = seq.get_dur();
Alg_time_map_ptr map = new_time_map_seq.get_time_map();
seq.set_time_map(map);
// the new time map begins where the alignment began, but due to
// smoothing and rounding, there may be some edge effects.
// Try to set the tempo before the first_beat to match the tempo
// at the first beat by introducing another time map point at least
// one beat before the first_beat. To do this, we need at least
// 2 beats before first_beat and at least 2 beats in the time map
// (needed to compute initial tempo). Furthermore, the tempo at
// first_beat could be so slow that we do not have enough time
// before first_beat to anticipate the tempo.
// NOTE(review): the guard below compares against totalbeats, but beats
// were only inserted when newtime > 0, so beats[i + 1] is not obviously
// guaranteed to exist -- confirm against locate_beat()'s contract.
if (first_beat >= 2 && totalbeats > first_beat + 1) {
int new_beat = first_beat / 2;
// compute initial tempo from first_beat and first_beat + 1
int i = map->locate_beat(first_beat);
double t1 = map->beats[i].time;
double t2 = map->beats[i + 1].time;
double spb = (t2 - t1); // seconds per beat, beat period
double new_time = t1 - (first_beat - new_beat) * spb;
if (new_time <= 0.2) {
// not enough time to start at new_time, new_beat
// let's try using half the time rather than half the beats
new_time = t1 / 2.0;
// this will round down, so new_beat < first_beat
new_beat = int(first_beat - (t1 / 2) / spb);
new_time = t1 - (first_beat - new_beat) * spb;
}
// need to check again if new_beat would be too early
if (new_time > 0.2) {
map->insert_beat(new_time, new_beat);
}
}
// Note: final tempo is extrapolated, so no need to insert new
// time map points beyond the last one
seq.set_dur(end_beat);
#ifdef DEBUG_LOG
fprintf(dbf, "\nend_beat %g end time %g\n",
seq.get_beat_dur(), seq.get_real_dur());
#endif
}
// NOTE(review): rendered diff hunk -- the void signature and the
// file1/file2 calls that pass `verbose` appear to be the pre-change
// version; the int-returning signature, the progress reporting and the
// file0/file1 calls the post-change version. All lines are kept verbatim.
// As rendered, the unterminated block-comment opener on the
// "Generate the chroma for file 1" line swallows every line up to the
// comment terminator after "This will always be the MIDI File"; that is
// an artifact of the interleaving, not of either version.
//
// Purpose (post-change version): measure the MIDI duration and note count,
// describe both inputs to the optional progress object, compute chroma for
// the MIDI sequence (file 0) and for the audio (file 1), then return the
// status of align_chromagrams().
// this routine performs an alignment by adjusting midi to match audio
//
void Scorealign::align_midi_to_audio(Alg_seq &seq, Audio_reader &reader,
bool verbose)
int Scorealign::align_midi_to_audio(Alg_seq &seq, Audio_reader &reader)
{
/* Generate the chroma for file 1
float dur = 0.0F;
int nnotes = find_midi_duration(seq, &dur);
if (progress) {
progress->set_frame_period(frame_period);
progress->set_smoothing(line_time > 0.0);
progress->set_duration(0, false, dur);
progress->set_duration(1, true, float(reader.actual_frame_period *
reader.frame_count));
progress->set_phase(0);
}
/* Generate the chroma for file 0
* This will always be the MIDI File when aligning midi with audio.
*/
file1_frames = gen_chroma_midi(seq, HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy1, &actual_frame_period_1, 1, verbose);
file0_frames = gen_chroma_midi(seq, dur, nnotes, HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy0, &actual_frame_period_0, 0);
/* Generate the chroma for file 2 */
file2_frames = gen_chroma_audio(reader, HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy2, &actual_frame_period_2, 2, verbose);
align_chromagrams(verbose);
/* Generate the chroma for file 1 */
if (progress) progress->set_phase(1);
file1_frames = gen_chroma_audio(reader, HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy1, &actual_frame_period_1, 1);
return align_chromagrams();
}
// NOTE(review): rendered diff hunk -- the void (reader1, reader2) form and
// the calls passing `verbose` appear to be the pre-change version; the
// int-returning (reader0, reader1) form with progress reporting the
// post-change version. All lines are kept verbatim.
//
// Purpose (post-change version): describe both audio inputs to the
// optional progress object (duration = actual_frame_period * frame_count),
// compute chroma for reader0 (file 0) and reader1 (file 1), then return
// the status of align_chromagrams().
void Scorealign::align_audio_to_audio(Audio_reader &reader1,
Audio_reader &reader2, bool verbose)
int Scorealign::align_audio_to_audio(Audio_reader &reader0,
Audio_reader &reader1)
{
if (progress) {
progress->set_frame_period(frame_period);
progress->set_duration(0, true, float(reader0.actual_frame_period *
reader0.frame_count));
progress->set_duration(1, true, float(reader1.actual_frame_period *
reader1.frame_count));
progress->set_phase(0);
progress->set_smoothing(line_time > 0.0);
}
file0_frames = gen_chroma_audio(reader0, HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy0, &actual_frame_period_0, 0);
if (progress) progress->set_phase(1);
file1_frames = gen_chroma_audio(reader1, HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy1, &actual_frame_period_1, 1, verbose);
file2_frames = gen_chroma_audio(reader2, HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy2, &actual_frame_period_2, 2, verbose);
align_chromagrams(verbose);
&chrom_energy1, &actual_frame_period_1, 1);
return align_chromagrams();
}
// NOTE(review): rendered diff hunk -- the void (seq1, seq2, verbose) form
// appears to be the pre-change version and the int-returning (seq0, seq1)
// form the post-change version. All lines are kept verbatim.
//
// Purpose (post-change version): measure duration and note count of both
// sequences with find_midi_duration(), describe them to the optional
// progress object, compute chroma for seq0 (file 0) and seq1 (file 1),
// then return the status of align_chromagrams().
void Scorealign::align_midi_to_midi(Alg_seq &seq1, Alg_seq &seq2,
bool verbose)
int Scorealign::align_midi_to_midi(Alg_seq &seq0, Alg_seq &seq1)
{
file1_frames = gen_chroma_midi(seq1, HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy1, &actual_frame_period_1, 1, verbose);
float dur0 = 0.0F;
int nnotes0 = find_midi_duration(seq0, &dur0);
float dur1 = 0.0F;
int nnotes1 = find_midi_duration(seq1, &dur1);
if (progress) {
progress->set_frame_period(frame_period);
progress->set_smoothing(line_time > 0.0);
progress->set_duration(0, false, dur0);
progress->set_duration(1, false, dur1);
file2_frames = gen_chroma_midi(seq2, HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy2, &actual_frame_period_2, 2, verbose);
progress->set_phase(0);
}
file0_frames = gen_chroma_midi(seq0, dur0, nnotes0,
HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy0, &actual_frame_period_0, 0);
align_chromagrams(verbose);
if (progress) progress->set_phase(1);
file1_frames = gen_chroma_midi(seq1, dur1, nnotes1,
HIGH_CUTOFF, LOW_CUTOFF,
&chrom_energy1, &actual_frame_period_1, 1);
return align_chromagrams();
}
// NOTE(review): rendered diff hunk -- pre- and post-change lines are
// interleaved and kept verbatim. The "if line_time is set" block appears
// twice, once before and once after the presmooth block: one copy is the
// old position and one the new, and this view does not say which.
// NOTE(review): the only norm_chroma() calls visible use the
// file1/file2 naming (apparently pre-change) while the
// "Normalized Chroma." message remains -- confirm where normalization
// happens after this change.
//
// Purpose (post-change version): derive the odd smoothing width `smooth`
// (at least 3 frames) from smooth_time and actual_frame_period_1, run
// compare_chroma() and return its status on failure, then build the
// regression-smoothed time map, redoing it after the optional presmooth()
// and curve_fitting() passes. Progress phases 2, 3 and 4 are reported on
// entry, after compare_chroma(), and just before the successful return.
void Scorealign::align_chromagrams(bool verbose)
int Scorealign::align_chromagrams()
{
if (progress) progress->set_phase(2);
if (verbose)
printf("\nGenerated Chroma.\n");
/* now that we have actual_frame_period_2, we can compute smooth */
/* now that we have actual_frame_period_1, we can compute smooth */
// smooth is an odd number of frames that spans about smooth_time
smooth = ROUND(smooth_time / actual_frame_period_2);
smooth = ROUND(smooth_time / actual_frame_period_1);
if (smooth < 3) smooth = 3;
if (!(smooth & 1)) smooth++; // must be odd
if (verbose) {
printf("smoothing time is %g\n", smooth_time);
printf("smooth count is %d\n", smooth);
}
/* Normalize the chroma frames */
norm_chroma(file1_frames, chrom_energy1);
SA_V(printf("Chromagram data for file 0:\n");)
SA_V(print_chroma_table(chrom_energy0, file0_frames);)
SA_V(printf("Chromagram data for file 1:\n");)
SA_V(print_chroma_table(chrom_energy1, file1_frames);)
norm_chroma(file2_frames, chrom_energy2);
SA_V(printf("Chromagram data for file 2:\n");)
SA_V(print_chroma_table(chrom_energy2, file2_frames);)
if (verbose)
printf("Normalized Chroma.\n");
/* Compare the chroma frames */
compare_chroma(verbose);
int result = compare_chroma();
// propagate SA_TOOSHORT / SA_CANCEL without doing any smoothing
if (result != SA_SUCCESS) {
return result;
}
if (progress) progress->set_phase(3);
/* Compute the smooth time map now for use by curve-fitting */
compute_regression_lines();
/* if line_time is set, do curve-fitting */
if (line_time > 0.0) {
curve_fitting(this, verbose);
/* Redo the smooth time map after curve fitting or smoothing */
compute_regression_lines();
}
/* if presmooth_time is set, do presmoothing */
if (presmooth_time > 0.0) {
presmooth();
/* Redo the smooth time map after curve fitting or smoothing */
compute_regression_lines();
}
/* if line_time is set, do curve-fitting */
if (line_time > 0.0) {
curve_fitting(this, verbose);
/* Redo the smooth time map after curve fitting or smoothing */
compute_regression_lines();
}
if (progress) progress->set_phase(4);
return SA_SUCCESS;
}

View File

@@ -12,52 +12,142 @@
#define SA_V(stmt)
#endif
// SAProgress: optional progress-reporting / cancellation interface for
// score alignment. The base class only records what it is told and never
// asks to cancel; subclasses override the virtual methods to drive a
// progress display and return false from a set_*_progress() method to
// request cancellation.
class SAProgress {
public:
    // All state starts zeroed so a subclass never observes indeterminate
    // values before the aligner has called the setters.
    SAProgress() : frame_period(0.0), phase(0), smoothing(false) {
        durations[0] = 0.0;
        durations[1] = 0.0;
        is_audio[0] = false;
        is_audio[1] = false;
    }

    // Subclasses are used through SAProgress pointers, so destruction
    // through the base type must be well defined.
    virtual ~SAProgress() {}

    // we need the frame period to convert seconds to work units
    // call this before set_duration()
    virtual void set_frame_period(double seconds) { frame_period = seconds; }

    // index = 0 or 1 to tell which file (first or second)
    // audio_flag = true (audio) or false (midi)
    // seconds = duration of audio or midi data
    // Any other index is ignored rather than written past the arrays.
    virtual void set_duration(int index, bool audio_flag, double seconds) {
        if (index < 0 || index > 1) return;
        durations[index] = seconds;
        is_audio[index] = audio_flag;
    }

    // if fitting pwl path to path, set smoothing to true
    virtual void set_smoothing(bool s) { smoothing = s; }

    // which alignment phase are we working on?
    // 0 = first file chroma, 1 = second file chroma, 2 = compute matrix,
    // 3 = smoothing (align_chromagrams() additionally reports 4 just
    // before it returns successfully)
    // Note: set_phase(0) is REQUIRED and must be called only ONCE.
    // This is when we calculate total work
    // and initialize any local state needed to handle set_feature_progress()
    // and set_matrix_progress().
    virtual void set_phase(int i) { phase = i; }

    // how many seconds have we processed during feature extraction
    // (the original note says "in phase 1 or 2"; by the numbering above
    // chroma extraction is phases 0 and 1 -- TODO confirm)
    // return value is normally true; false is request to cancel
    virtual bool set_feature_progress(float seconds) {
        (void) seconds;
        return true;
    }

    // report that some matrix elements have been computed
    // return value is normally true; false is request to cancel
    virtual bool set_matrix_progress(int cells) {
        (void) cells;
        return true;
    }

    // report iterations of line smoothing
    // return value is normally true; presumably false requests
    // cancellation like the other two -- TODO confirm with the caller
    virtual bool set_smoothing_progress(int i) {
        (void) i;
        return true;
    }

protected:
    double frame_period;  // seconds per frame, from set_frame_period()
    int phase;            // last value passed to set_phase()
    double durations[2];  // seconds, indexed by file (0 or 1)
    bool is_audio[2];     // true = audio, false = midi, indexed by file
    bool smoothing;       // last value passed to set_smoothing()
};
// Status codes returned by the Scorealign alignment routines.
enum {
    // alignment ran to completion
    SA_SUCCESS = 0,
    // returned by compare_chroma() when, after optional silence trimming,
    // either sequence spans no more than one frame
    SA_TOOSHORT = 1,
    // returned by compare_chroma() when the progress callback's
    // set_matrix_progress() returns false
    SA_CANCEL = 2
};
// Default alignment parameters. The Scorealign constructor copies each
// numeric/boolean SA_DFT_* value into the member of the same name.
// Each value is paired with a wxT() string form (suffix _TEXT or _STRING);
// presumably these are for display in the user interface -- their use is
// not visible here.
// nominal chroma frame period
#define SA_DFT_FRAME_PERIOD 0.2
#define SA_DFT_FRAME_PERIOD_TEXT wxT("0.20 secs")
// analysis window size
#define SA_DFT_WINDOW_SIZE 0.2
#define SA_DFT_WINDOW_SIZE_TEXT wxT("0.20 secs")
// when true, compare_chroma() ends the path at (last_x, last_y) instead of
// searching the last row/column for the cheapest end point
#define SA_DFT_FORCE_FINAL_ALIGNMENT true
#define SA_DFT_FORCE_FINAL_ALIGNMENT_STRING wxT("true")
// when true, compare_chroma() trims leading and trailing silent frames
#define SA_DFT_IGNORE_SILENCE true
#define SA_DFT_IGNORE_SILENCE_STRING wxT("true")
// threshold used when classifying frames as silent (units not shown here)
#define SA_DFT_SILENCE_THRESHOLD 0.1
#define SA_DFT_SILENCE_THRESHOLD_TEXT wxT("0.100")
// 0 disables presmoothing: align_chromagrams() only calls presmooth()
// when presmooth_time > 0.0
#define SA_DFT_PRESMOOTH_TIME 0
#define SA_DFT_PRESMOOTH_TIME_TEXT wxT("(off)")
// 0 disables curve fitting: align_chromagrams() only calls
// curve_fitting() when line_time > 0.0
#define SA_DFT_LINE_TIME 0
#define SA_DFT_LINE_TIME_TEXT wxT("(off)")
// duration of the smoothing window used to derive `smooth`
#define SA_DFT_SMOOTH_TIME 1.75
#define SA_DFT_SMOOTH_TIME_TEXT wxT("1.75 secs")
class Scorealign {
public:
float frame_period; // time in seconds
float window_size;
float presmooth_time;
float line_time;
float smooth_time; // duration of smoothing window
double frame_period; // time in seconds
double window_size;
double silence_threshold;
bool force_final_alignment;
bool ignore_silence;
double presmooth_time;
double line_time;
double smooth_time; // duration of smoothing window
int smooth; // number of points used to compute the smooth time map
Scorealign() {
frame_period = 0.25;
window_size = 0.25;
presmooth_time = 0.0;
line_time = 0.0;
smooth_time = 1.75;
pathlen = 0;
path_count = 0;
pathx = NULL;
pathy = NULL;
}
Scorealign();
~Scorealign();
~Scorealign() {
if (pathx) free(pathx);
if (pathy) free(pathy);
}
SAProgress *progress;
bool verbose;
// chromagrams and lengths, path data
float *chrom_energy0;
int file0_frames; // number of frames in file0
float *chrom_energy1;
int file1_frames; // number of frames in file1
float *chrom_energy2;
int file2_frames; //number of frames in file2
int file1_frames; //number of frames in file1
// pathx, pathy, and pathlen describe the shortest path through the
// matrix from first_x, first_y to last_x, last_y (from the first
// non-silent frame to the last non-silent frame). The length varies
// depending upon the amount of silence that is ignored and how many
// path steps are diagonal.
short *pathx; //for midi (when aligning midi and audio)
short *pathy; //for audio (when aligning midi and audio)
int pathlen;
// first_x, first_y, last_x, last_y are the starting and ending
// points of the path. (It's not 0, 0, file0_frames, file1_frames
// because silent frames may be trimmed from beginning and ending.)
int first_x;
int first_y;
int last_x;
int last_y;
void set_pathlen(int p) { pathlen = p; }
// time_map is, for each sequence 0 frame, the time of the matching
// frame in sequence 1. If the path associates a frame of sequence 0
// with multiple frames in sequence 1, the sequence 1 frame times
// are averaged. The frames that are not mapped to sequence 1 are
// marked with a time of -9999 or NOT_MAPPED.
// These will be silent frames of sequence 0.
#define NOT_MAPPED -9999.0F
float *time_map;
// smooth_time_map is a smoothed version of time_map. It also has
// non-mapped frames marked with times of -9999 or NOT_MAPPED.
// Because of smoothing, frames in smooth_time_map may map to
// negative times in sequence 1.
// These negative times will not be as negative as -9999, but
// the recommended coding style is to compare for equality with
// NOT_MAPPED to test for that value.
float *smooth_time_map;
// chroma vectors are calculated from an integer number of samples
// that approximates the nominal frame_period. Actual frame period
// is calculated and stored here:
// time in seconds for midi (when aligning midi and audio)
float actual_frame_period_1;
double actual_frame_period_0;
// time in seconds for audio (when aligning midi and audio)
float actual_frame_period_2;
double actual_frame_period_1;
/* gen_chroma.cpp stuff:
generates the chroma energy for a given file
@@ -69,36 +159,43 @@ class Scorealign {
(i.e. the length of the 1st dimension of chrom_energy)
*/
int gen_chroma_audio(Audio_reader &reader, int hcutoff, int lcutoff,
float **chrom_energy, float *actual_frame_period,
int id, bool verbose);
float **chrom_energy, double *actual_frame_period,
int id);
int gen_chroma_midi(Alg_seq &seq, int hcutoff, int lcutoff,
float **chrom_energy, float *actual_frame_period,
int id, bool verbose);
int gen_chroma_midi(Alg_seq &seq, float dur, int nnotes,
int hcutoff, int lcutoff,
float **chrom_energy, double *actual_frame_period,
int id);
/* comp_chroma.cpp stuff */
/* GEN_DIST
*
* This function generates the Euclidean distance for points i
* and j in two chroma vectors for use with dynamic time warping of
* the chroma vectors.
*/
float gen_dist(int i, int j);
/* scorealign.cpp stuff: */
float map_time(float t1);
void midi_tempo_align(Alg_seq &seq , char *midiname, char *beatname);
void align_midi_to_audio(Alg_seq &seq, Audio_reader &reader,
bool verbose);
void align_midi_to_midi(Alg_seq &seq1, Alg_seq &seq2, bool verbose);
void align_audio_to_audio(Audio_reader &reader1,
Audio_reader &reader2, bool verbose);
void align_chromagrams(bool verbose);
int align_midi_to_audio(Alg_seq &seq, Audio_reader &reader);
int align_midi_to_midi(Alg_seq &seq0, Alg_seq &seq2);
int align_audio_to_audio(Audio_reader &reader1, Audio_reader &reader2);
int align_chromagrams();
int path_count; // for debug log formatting
void path_step(int i, int j);
void path_reverse();
int sec_to_pathy_index(float sec);
void compare_chroma(bool verbose);
int compare_chroma();
void linear_regression(int n, int width, float &a, float &b);
void compute_smooth_time_map();
void presmooth();
void compute_regression_lines();
void midi_tempo_align(Alg_seq &seq, bool verbose);
void midi_tempo_align(Alg_seq &seq);
};
#define DEBUG_LOG 0
//#define DEBUG_LOG 1
#if DEBUG_LOG
extern FILE *dbf;
#endif