/* convolve.c -- implements (non-"fast") convolution */
/* 
 * Note: this code is mostly generated by translate.lsp (see convole.tran
 * in the tran directory), but it has been modified by hand to extend the
 * stop time to include the "tail" of the convolution beyond the length
 * of the first parameter.
 */

/* Original convolve.c modified to do fast convolution. Here are some
 * notes:
 *    The first arg is arbitrary length. The second arg is the impulse
 * response, which is converted into a table. The FFT size will be
 * limited to 64K, which allows convolution with up to 32K samples.
 * For longer impulse responses, we'll have to do convolutions one
 * 32K block at a time. I considered just limiting the convolution
 * size and handling longer impulse responses in Nyquist XLISP code,
 * but that would require taking FFT's of each input block multiple
 * times. Here, we save the FFT's and reuse them, which should gain
 * a factor of 2 in speed (we still have to inverse FFT each block
 * after multiplication, which should take 1/2 the time of doing
 * FFT/inverse-FFT on each block).
 *
 * The fast convolution works like this: 
 *   inputs are x_snd and h_snd. 
 *   Compute the length of h_snd in samples.
 *   Set fft_size = MAX_FFT_SIZE
 *   If length <= MAX_FFT_SIZE / 4 then
 *      set fft_size = (round length to power of 2) * 2
 *   set N = fft_size/2
 *   Set h_len = (length rounded up to multiple of fft_size/2) * 2
 *   Let L = h_len/ fft_size
 *   Allocate H of h_len floats
 *   Iterate over i from 0 to L-1:
 *       Copy ht with zero fill into H[i] of size fft_size,
 *         where each H[i] of size fft_size is filled with
 *         fft_size/2 samples (except for the last H[i])
 *       Compute FFT of H[i] in place (FFT size is fft_size)
 *   Allocate X of h_len floats. This represents the history
 *       of x_snd, which is initially all zero, so the FFT, X is all zero
 *   Allocate output buffers Y and R, each of size fft_size
 *   Iterate over j (i.e. run this to generate MAX_CONVOLVE_LEN
 *           samples; then j = (j + 1) mod L.
 *       Copy 2nd half of R to first half and zero the 2nd half.
 *         Note: the first time does nothing because R is initially 
 *               filled with zeros
 *       Copy fft_size/2 samples of x_snd into X[j],
 *         where X[j] is of size fft_size and filled with
 *         N samples (except when x_snd terminates)
 *       Zero fill X[j]
 *       Compute FFT of X[j] in place.
 *       Iterate k = 0 to L-1
 *           Multiply X[(j-k) mod L] by H[k] (result goes into Y).
 *           Compute IFFT of Y in place. Y is now time domain convolution
 *               of two blocks of samples.
 *           Add Y to R.
 *       Now N samples of R can be output.
 * For simplicity, we'll keep processing x_snd input even after x_snd
 * terminates. This will avoid special cases where we do not need all
 * of X[j] at the end of the convolution.
 *
 * Length of output is length of x input + length of h
 */

// You can turn on debugging output with: #define D if (1)
#define D if (0)

#define MAX_IR_LEN 4000000 /* maximum impulse response length */
#define MAX_LOG_FFT_SIZE 16 /* maximum fft size for convolution */
//#define MAX_LOG_FFT_SIZE 4 /* maximum fft size for convolution */
#define _USE_MATH_DEFINES 1 /* for Visual C++ to get M_LN2 */
#include <assert.h>
#include <math.h>
#include "stdio.h"
#ifndef mips
#include "stdlib.h"
#endif
#include "xlisp.h"
#include "sound.h"
#include "samples.h"

#include "falloc.h"
#include "cext.h"
#include "fftlib.h"
#include "fftext.h"
#include "convolve.h"

void convolve_free(snd_susp_type a_susp);


typedef struct convolve_susp_struct {
    snd_susp_node susp;
    int64_t terminate_cnt;
    boolean know_end_of_x;
    boolean logically_stopped;
    sound_type x_snd;
    int x_snd_cnt;
    sample_block_values_type x_snd_ptr;

    sample_type *X; // the FFTs of x_snd
    int j; // which block are we processing? 0 <= j < L
    sample_type *H; // the FFTs of h_snd
    sample_type *Y; // product of X*H where we inverse FFT
    int h_snd_len; // true length of h_snd in samples
    int N; // length of convolution, FFTs are of size 2*N
    int M; // log2 of 2*N, the FFT size
    int L; // number of blocks: h_len / (2*N)
    sample_type *R; // result buffer where output is summed
    sample_type *R_current; // pointer to next sample to output
} convolve_susp_node, *convolve_susp_type;

/*
void h_reverse(sample_type *h, long len)
{
    sample_type temp;
    int i;
    
    for (i = 0; i < len; i++) {
        temp = h[i];
        h[i] = h[len - 1];
        h[len - 1] = temp;
        len--;
    }
}
*/

void convolve_s_fetch(snd_susp_type a_susp, snd_list_type snd_list)
{
    convolve_susp_type susp = (convolve_susp_type) a_susp;
    int cnt = 0; /* how many samples computed */
    int togo;
    int n;
    sample_block_type out;
    register sample_block_values_type out_ptr;

    register sample_block_values_type out_ptr_reg;

    sample_type *R = susp->R;
    sample_type *R_current;
    int N = susp->N;
    falloc_sample_block(out, "convolve_s_fetch");
    out_ptr = out->samples;
    snd_list->block = out;

    while (cnt < max_sample_block_len) { /* outer loop */
        /* first compute how many samples to generate in inner loop: */
        /* don't overflow the output sample block: */
        togo = max_sample_block_len - cnt;
        /* if we need output samples, generate them here */
        D printf("test R_current at offset %td\n", susp->R_current - R);
        if (susp->R_current >= R + N) { // true when we output half of R
            int i = 0;
            int k;
            sample_type *Xj = susp->X + susp->j * N * 2;
            sample_type *H = susp->H;
            sample_type *Y = susp->Y;
            int to_copy;
            /* Shift R, zero fill: */
            memcpy(R, R + N, N * sizeof(*R));
            memset(R + N, 0, N * sizeof(*R));
            /* Copy N samples of x_snd into Xj and zero fill to size 2N */
            D printf("Copying N samples of x_snd into Xj at offset %td\n", Xj - susp->X);
            while (i < N) {
                if (susp->x_snd_cnt == 0) {
                    susp_get_samples(x_snd, x_snd_ptr, x_snd_cnt);
                    if (susp->x_snd->logical_stop_cnt == 
                        susp->x_snd->current - susp->x_snd_cnt) {
                        min_cnt(&susp->susp.log_stop_cnt, susp->x_snd, 
                                (snd_susp_type) susp, susp->x_snd_cnt);
                    }
                }
                /* This code is not standard. Since we extend the terminate
                 * count by susp->h_snd_len, the "standard" call to min_cnt()
                 * results in extending the terminate time forever. Instead,
                 * we make this code run once only by setting know_end_of_x.
                 */                   
                if (!susp->know_end_of_x && 
                    susp->x_snd_ptr == zero_block->samples) {
                    susp->terminate_cnt = susp->x_snd->current - susp->x_snd_cnt;
                    /* extend the output to include impulse response */
                    susp->terminate_cnt += susp->h_snd_len;
                    susp->know_end_of_x = TRUE;
                }
                /* copy no more than the remaining space and no more than
                 * the amount remaining in the block
                 */
                to_copy = min(N - i, susp->x_snd_cnt);
                memcpy(Xj + i, susp->x_snd_ptr, 
                       to_copy * sizeof(*susp->x_snd_ptr));
                susp->x_snd_ptr += to_copy;
                susp->x_snd_cnt -= to_copy;
                i += to_copy;
            }
            /* zero fill to size 2N */
            memset(Xj + N, 0, N * sizeof(Xj[0]));
            D {
                printf("Xj at offset %td: ", Xj - susp->X);
                printf("    %d samples ", susp->N * 2);
                float big = 0.0;
                for (i = 0; i < susp->N * 2; i++) {
                    // printf("%g ", Xj[i]);
                    big = max(big, fabs(Xj[i]));
                }
                printf("MAX: %g\n", big);
            }
            /* Compute FFT of Xj in place */
            fftInit(susp->M);
            rffts(Xj, susp->M, 1);
            /* convolve pairs of blocks and sum into Y */
            memset(Y, 0, N * sizeof(*Y)); /* initialize sum to zero */
            for (k = 0; k < susp->L; k++) {
                /* Multiply Xj by H (result goes into X) */
                sample_type *X = susp->X + ((susp->L + susp->j - k) % susp->L) * N * 2;
                rspectprod(X, H + k * N * 2, Y, N * 2);
                /* Compute IFFT of Y in place */
                riffts(Y, susp->M, 1);
                /* R += Y */
                D { printf("Output block %d, X offset %td: ", k, X - susp->X);
                    printf(" %d samples ", 2 * N);
                    float big = 0.0;
                    for (i = 0; i < 2 * N; i++) {
                        big = max(big, fabs(Y[i]));
                    }
                    printf("MAX: %g\n", big);
                }
                for (i = 0; i < 2 * N; i++) {
                    R[i] += Y[i];
                }
            }
            /* now N samples of R can be output */
            susp->R_current = R;
            D printf("R: %d samples ", susp->N);
            D { float big = 0.0;
                for (i = 0; i < susp->N; i++) {
                    // printf("%g ", R[i]);
                    big = max(big, fabs(R[i]));
                }
                printf("MAX: %g\n", big);
            }
            susp->j = (susp->j + 1) % susp->L;
        }
        /* compute togo, the number of samples to "compute" */
        /* can't use more than what's left in R. R_current is
           the next sample of R, so what's left is N - (R - R_current) */
        R_current = susp->R_current;
        togo = (int) min(togo, N - (R_current - R));
        
        /* don't run past terminate time */
        if (susp->terminate_cnt != UNKNOWN &&
            susp->terminate_cnt <= susp->susp.current + cnt + togo) {
            togo = (int) (susp->terminate_cnt - (susp->susp.current + cnt));
            if (togo == 0) break;
        }

        /* don't run past logical stop time */
        if (!susp->logically_stopped &&
            susp->susp.log_stop_cnt !=  UNKNOWN &&
            susp->susp.log_stop_cnt <= susp->susp.current + cnt + togo) {
            togo = (int) (susp->susp.log_stop_cnt - (susp->susp.current + cnt));
            D printf("susp->susp.log_stop_cnt is set to %" PRId64 "\n",
		     susp->susp.log_stop_cnt);
            if (togo == 0) break;
        }       

        n = togo;
        out_ptr_reg = out_ptr;
        if (n) do { /* the inner sample computation loop */
            *out_ptr_reg++ = (sample_type) *R_current++;
        } while (--n); /* inner loop */

        /* using R_current is a bad idea on RS/6000: */
        susp->R_current += togo;
        out_ptr += togo;
        cnt += togo;
    } /* outer loop */

    /* test for termination */
    if (togo == 0 && cnt == 0) {
        snd_list_terminate(snd_list);
    } else {
        snd_list->block_len = cnt;
        susp->susp.current += cnt;
    }
    /* test for logical stop */
    if (susp->logically_stopped) {
        snd_list->logically_stopped = true;
    } else if (susp->susp.log_stop_cnt == susp->susp.current) {
        susp->logically_stopped = true;
    }
} /* convolve_s_fetch */


void convolve_toss_fetch(snd_susp_type a_susp, snd_list_type snd_list)
{
    convolve_susp_type susp = (convolve_susp_type) a_susp;
    time_type final_time = susp->susp.t0;
    long n;

    /* fetch samples from x_snd up to final_time for this block of zeros */
    while ((ROUNDBIG((final_time - susp->x_snd->t0) * susp->x_snd->sr)) >=
	   susp->x_snd->current)
	susp_get_samples(x_snd, x_snd_ptr, x_snd_cnt);
    /* convert to normal processing when we hit final_count */
    /* we want each signal positioned at final_time */
    n = (long) ROUNDBIG((final_time - susp->x_snd->t0) * susp->x_snd->sr -
                        (susp->x_snd->current - susp->x_snd_cnt));
    susp->x_snd_ptr += n;
    susp_took(x_snd_cnt, n);
    susp->susp.fetch = susp->susp.keep_fetch;
    (*(susp->susp.fetch))(a_susp, snd_list);
}


void convolve_mark(snd_susp_type a_susp)
{
    convolve_susp_type susp = (convolve_susp_type) a_susp;
    sound_xlmark(susp->x_snd);
}


void convolve_free(snd_susp_type a_susp)
{
    convolve_susp_type susp = (convolve_susp_type) a_susp;
    free(susp->R);
    free(susp->X);
    free(susp->Y);
    free(susp->H);
    sound_unref(susp->x_snd);
    ffree_generic(susp, sizeof(convolve_susp_node), "convolve_free");
}


void convolve_print_tree(snd_susp_type a_susp, int n)
{
    convolve_susp_type susp = (convolve_susp_type) a_susp;
    indent(n);
    stdputstr("x_snd:");
    sound_print_tree_1(susp->x_snd, n);
}

void fill_with_samples(sample_type *x, sound_type s, long n)
{
/* this is based on snd_fetch in samples.c */
#define CNT extra[1]
#define INDEX extra[2]
#define FIELDS 3
#define SAMPLES list->block->samples
    int i;
    for (i = 0; i < n; i++) {
        if (!s->extra) { /* this is the first call, so fix up s */
            s->extra = (int64_t *) malloc(sizeof(s->extra[0]) * FIELDS);
            s->extra[0] = sizeof(s->extra[0]) * FIELDS;
            s->CNT = s->INDEX = 0;
        }
        int icnt = (int) s->CNT;  /* need this to be int type */
        assert(icnt >= 0);
        if (icnt == s->INDEX) {
            sound_get_next(s, &icnt);
            assert(icnt >= 0);
            s->CNT = icnt;  /* save the count back into s->extra */
            s->INDEX = 0;
        }
        x[i] = s->SAMPLES[s->INDEX++] * s->scale;
        assert(x[i] < 2);
    }
    D { float big = 0.0;
        for (i = 0; i < n; i++) {
            big = max(big, fabs(x[i]));
            assert(big < 2);
        }
        printf("fill_with_samples n %ld scale %g max %g\n", n, s->scale, big);
    }
}


sound_type snd_make_convolve(sound_type x_snd, sound_type h_snd)
{
    register convolve_susp_type susp;
    rate_type sr = x_snd->sr;
    time_type t0 = x_snd->t0;
    sample_type scale_factor = 1.0F;
    time_type t0_min = t0;
    int64_t h_len;
    int i;
    // assume fft_size is maximal. We fix this later if it is wrong
    long fft_size = 1 << MAX_LOG_FFT_SIZE;
    if (sr != h_snd->sr) {
        xlfail("convolve requires both inputs to have the same sample rates");
    }
    falloc_generic(susp, convolve_susp_node, "snd_make_convolve");
    /* compute the length of h_snd in samples */
    h_len = snd_length(h_snd, MAX_IR_LEN + 1);
    if (h_len > MAX_IR_LEN) {
        char emsg[100];
        sprintf(emsg, "convolve maximum impulse length is %d", MAX_IR_LEN);
        xlfail(emsg);
    }
    /* len is the impulse response length; 
     * the FFT size is at least double that */
    if (h_len <= fft_size / 4) {
        /* compute log-base-2(h_len): */;
        double log_len = log((double) h_len) / M_LN2;
        int log_len_int = (int) log_len;
        if (log_len_int != log_len) log_len_int++; /* round up to power of 2 */
        susp->M = log_len_int + 1;
    } else {
        susp->M = MAX_LOG_FFT_SIZE;
    }
    fft_size = (1 << susp->M);
    D printf("fft_size %ld\n", fft_size);
    susp->N = fft_size / 2;
    // round h_len up to multiple of susp->N and multiply by 2
    susp->h_snd_len = (int) h_len;
    h_len = ((h_len + susp->N - 1) / susp->N) * susp->N * 2;
    susp->L = (int) (h_len / fft_size);
    // allocate memory
    susp->H = (sample_type *) calloc((size_t) h_len, sizeof(susp->H[0]));
    if (!susp->H) {
        xlfail("memory allocation failure in convolve");
    }
    for (i = 0; i < susp->L; i++) {
        /* copy fft_size/2 samples into each H[i] */
        fill_with_samples(susp->H + i * susp->N * 2, h_snd, susp->N);
    }
    for (i = 0; i < susp->L; i++) {
        int j;
        float *H = susp->H + i * susp->N * 2;
        D { printf("H_%d at %td: ", i, H - susp->H);
            printf("%d samples ", susp->N * 2);
            float big = 0.0;
            for (j = 0; j < susp->N * 2; j++) {
                big = max(big, fabs(H[j]));
                assert(big < 2);
                // printf("%g ", H[j]);
            }
            printf("big %g\n", big);
        }
    }
    sound_unref(h_snd);
    h_snd = NULL;
    /* remaining N samples are already zero-filled */
    if (fftInit(susp->M)) {
        free(susp->H);
        xlfail("fft initialization error in convolve");
    }
    /* take the FFT of each block of the impulse response */
    for (i = 0; i < susp->L; i++) {
        rffts(susp->H + i * susp->N * 2, susp->M, 1);
    }
    susp->X = (sample_type *) calloc((size_t) h_len, sizeof(susp->X[0]));
    susp->R = (sample_type *) calloc(fft_size, sizeof(susp->R[0]));
    susp->Y = (sample_type *) calloc(fft_size, sizeof(susp->Y[0]));
    if (!susp->X || !susp->R || !susp->Y) {
        free(susp->H);
        if (susp->X) free(susp->X);
        if (susp->R) free(susp->R);
        if (susp->Y) free(susp->Y);
        xlfail("memory allocation failed in convolve");
    }
    susp->R_current = susp->R + susp->N;
    susp->susp.fetch = &convolve_s_fetch;
    susp->terminate_cnt = UNKNOWN;
    susp->know_end_of_x = FALSE;
    /* handle unequal start times, if any */
    if (t0 < x_snd->t0) sound_prepend_zeros(x_snd, t0);
    /* minimum start time over all inputs: */
    t0_min = min(x_snd->t0, t0);
    /* how many samples to toss before t0: */
    susp->susp.toss_cnt = (long) ((t0 - t0_min) * sr + 0.5);
    if (susp->susp.toss_cnt > 0) {
	susp->susp.keep_fetch = susp->susp.fetch;
	susp->susp.fetch = convolve_toss_fetch;
    }

    /* initialize susp state */
    susp->susp.free = convolve_free;
    susp->susp.sr = sr;
    susp->susp.t0 = t0;
    susp->susp.mark = convolve_mark;
    susp->susp.print_tree = convolve_print_tree;
    susp->susp.name = "convolve";
    susp->logically_stopped = false;
    susp->susp.log_stop_cnt = logical_stop_cnt_cvt(x_snd);
    susp->susp.current = 0;
    susp->x_snd = x_snd;
    susp->x_snd_cnt = 0;
    susp->j = 0;
    return sound_create((snd_susp_type)susp, t0, sr, scale_factor);
}


sound_type snd_convolve(sound_type x_snd, sound_type h_snd)
{
    sound_type x_snd_copy = sound_copy(x_snd);
    sound_type h_snd_copy = sound_copy(h_snd);
    return snd_make_convolve(x_snd_copy, h_snd_copy);
}