--- orig/nyquist/nyqsrc/convolve.c	2015-05-04 12:41:01.497976900 -0500
+++ nyquist/nyqsrc/convolve.c	2015-05-04 12:40:32.047737200 -0500
@@ -6,34 +6,6 @@
  * of the first parameter.
  */
 
-/* Original convolve.c modified to do fast convolution. Here are some
- * notes:
- *    The first arg is arbitrary length. The second arg is the impulse
- * response, which is converted into a table. Tables have limited maximum
- * size, which is good because we're going to use a single FFT for the
- * whole impulse response.
- *
- * The fast convolution works like this: 
- *   inputs are x_snd and h_snd. 
- *   Make h_snd into a table ht of size N, where N is a power of 2. 
- *   Copy ht with zero fill into H of size 2N.
- *   Compute FFT of H in place.
- *   Iterate:
- *       Copy N samples of x_snd into X and zero fill to size 2N.
- *       Compute FFT of X in place.
- *       Multiply X by H (result goes into X).
- *       Compute IFFT of X in place
- *       Add X to R.
- *       Now N samples of R can be output.
- *       Copy 2nd half of R to first half and zero the 2nd half.
- *         (this is actually done first, and the first time does
- *          nothing because R is initially filled with zeros)
- *
- * Length of output is length of x input + length of h
- */
-
-#define _USE_MATH_DEFINES 1 /* for Visual C++ to get M_LN2 */
-#include <math.h>
 #include "stdio.h"
 #ifndef mips
 #include "stdlib.h"
@@ -43,8 +15,6 @@
 
 #include "falloc.h"
 #include "cext.h"
-#include "fftlib.h"
-#include "fftext.h"
 #include "convolve.h"
 
 void convolve_free();
@@ -58,13 +28,13 @@
     long x_snd_cnt;
     sample_block_values_type x_snd_ptr;
 
-    sample_type *H; // the FFT of h_snd
-    int h_len; // true length of H
-    int N; // length of block, FFTs are of size 2*N
-    int M; // log2 of 2*N, the FFT size
-    sample_type *X;
-    sample_type *R; // result buffer where output is summed
-    sample_type *R_current;
+    table_type table;
+    sample_type *h_buf;
+    double length_of_h;
+    long h_len;
+    long x_buf_len;
+    sample_type *x_buffer_pointer;
+    sample_type *x_buffer_current;
 } convolve_susp_node, *convolve_susp_type;
 
 
@@ -82,9 +52,8 @@
 }
 
 
-void convolve_s_fetch(snd_susp_type a_susp, snd_list_type snd_list)
+void convolve_s_fetch(register convolve_susp_type susp, snd_list_type snd_list)
 {
-    convolve_susp_type susp = (convolve_susp_type) a_susp;
     int cnt = 0; /* how many samples computed */
     int togo;
     int n;
@@ -93,9 +62,13 @@
 
     register sample_block_values_type out_ptr_reg;
 
-    sample_type *R = susp->R;
-    sample_type *R_current;
-    int N = susp->N;
+    register sample_type * h_buf_reg;
+    register long h_len_reg;
+    register long x_buf_len_reg;
+    register sample_type * x_buffer_pointer_reg;
+    register sample_type * x_buffer_current_reg;
+    register sample_type x_snd_scale_reg = susp->x_snd->scale;
+    register sample_block_values_type x_snd_ptr_reg;
     falloc_sample_block(out, "convolve_s_fetch");
     out_ptr = out->samples;
     snd_list->block = out;
@@ -104,60 +77,43 @@
         /* first compute how many samples to generate in inner loop: */
         /* don't overflow the output sample block: */
         togo = max_sample_block_len - cnt;
-        /* if we need output samples, generate them here */
-        if (susp->R_current >= R + N) {
-            /* Copy N samples of x_snd into X and zero fill to size 2N */
-            int i = 0;
-            sample_type *X = susp->X;
-            sample_type *H = susp->H;
-            int to_copy;
-            while (i < N) {
+
+	    /* don't run past the x_snd input sample block: */
+	    /* based on susp_check_term_log_samples, but offset by h_len */
+
+	    /* THIS IS EXPANDED BELOW
+	     * susp_check_term_log_samples(x_snd, x_snd_ptr, x_snd_cnt);
+	     */
                 if (susp->x_snd_cnt == 0) {
                     susp_get_samples(x_snd, x_snd_ptr, x_snd_cnt);
+	      
+	        /* THIS IS EXPANDED BELOW
+	         *logical_stop_test(x_snd, susp->x_snd_cnt);
+	         */
                     if (susp->x_snd->logical_stop_cnt == 
                         susp->x_snd->current - susp->x_snd_cnt) {
                         min_cnt(&susp->susp.log_stop_cnt, susp->x_snd, 
                                 (snd_susp_type) susp, susp->x_snd_cnt);
                     }
-                }
+
+	        /* THIS IS EXPANDED BELOW
+	         * terminate_test(x_snd_ptr, x_snd, susp->x_snd_cnt);
+	         */
                 if (susp->x_snd_ptr == zero_block->samples) {
+	            /* ### modify this to terminate at an offset of (susp->h_len) */
+                /* Note: in the min_cnt function, susp->x_snd_cnt is *subtracted*
+                 *   from susp->x_snd->current to form the terminate time, so to
+                 *   increase the time, we need to *subtract* susp->h_len, which
+                 *   due to the double negative, *adds* susp->h_len to the ultimate
+                 *   terminate time calculation.
+                 */
                     min_cnt(&susp->terminate_cnt, susp->x_snd,
-                            (snd_susp_type) susp, susp->x_snd_cnt);
-                    /* extend the output to include impulse response */
-                    susp->terminate_cnt += susp->h_len;
+			            (snd_susp_type) susp, susp->x_snd_cnt - susp->h_len);
                 }
-                /* copy no more than the remaining space and no more than
-                 * the amount remaining in the block
-                 */
-                to_copy = min(N - i, susp->x_snd_cnt);
-                memcpy(X + i, susp->x_snd_ptr, 
-                       to_copy * sizeof(*susp->x_snd_ptr));
-                susp->x_snd_ptr += to_copy;
-                susp->x_snd_cnt -= to_copy;
-                i += to_copy;
-            }
-            /* zero fill to size 2N */
-            memset(X + N, 0, N * sizeof(X[0]));
-            /* Compute FFT of X in place */
-            fftInit(susp->M);
-            rffts(X, susp->M, 1);
-            /* Multiply X by H (result goes into X) */
-            rspectprod(X, H, X, N * 2);
-            /* Compute IFFT of X in place */
-            riffts(X, susp->M, 1);
-            /* Shift R, zero fill, add X, all in one loop */
-            for (i = 0; i < N; i++) {
-                R[i] = R[i + N] + X[i];
-                R[i + N] = X[i + N];
-            }
-            /* now N samples of R can be output */
-            susp->R_current = R;
-        }
-        /* compute togo, the number of samples to "compute" */
-        /* can't use more than what's left in R. R_current is
-           the next sample of R, so what's left is N - (R - R_current) */
-        R_current = susp->R_current;
-        togo = min(togo, N - (R_current - R));
+	    }
+
+
+	    togo = min(togo, susp->x_snd_cnt);
 
         /* don't run past terminate time */
         if (susp->terminate_cnt != UNKNOWN &&
@@ -166,23 +122,69 @@
             if (togo == 0) break;
         }
 
+
         /* don't run past logical stop time */
-        if (!susp->logically_stopped &&
-            susp->susp.log_stop_cnt !=  UNKNOWN &&
-            susp->susp.log_stop_cnt <= susp->susp.current + cnt + togo) {
-            togo = susp->susp.log_stop_cnt - (susp->susp.current + cnt);
-            if (togo == 0) break;
+	    if (!susp->logically_stopped && susp->susp.log_stop_cnt != UNKNOWN) {
+	        int to_stop = susp->susp.log_stop_cnt - (susp->susp.current + cnt);
+	        /* break if to_stop == 0 (we're at the logical stop)
+	         * AND cnt > 0 (we're not at the beginning of the
+	         * output block).
+	         */
+	        if (to_stop < togo) {
+		    if (to_stop == 0) {
+		        if (cnt) {
+			    togo = 0;
+			    break;
+		        } else /* keep togo as is: since cnt == 0, we
+		                * can set the logical stop flag on this
+		                * output block
+		                */
+			    susp->logically_stopped = true;
+		    } else /* limit togo so we can start a new
+		            * block at the LST
+		            */
+		        togo = to_stop;
+	        }
         }       
 
         n = togo;
+	    h_buf_reg = susp->h_buf;
+	    h_len_reg = susp->h_len;
+	    x_buf_len_reg = susp->x_buf_len;
+	    x_buffer_pointer_reg = susp->x_buffer_pointer;
+	    x_buffer_current_reg = susp->x_buffer_current;
+	    x_snd_ptr_reg = susp->x_snd_ptr;
         out_ptr_reg = out_ptr;
         if (n) do { /* the inner sample computation loop */
-            *out_ptr_reg++ = (sample_type) *R_current++;
+            long i; double sum;
+            /* see if we've reached end of x_buffer */
+            if ((x_buffer_pointer_reg + x_buf_len_reg) <= (x_buffer_current_reg + h_len_reg)) {
+                /* shift x_buffer from current back to base */
+                for (i = 1; i < h_len_reg; i++) {
+                    x_buffer_pointer_reg[i-1] = x_buffer_current_reg[i];
+                }    
+                /* this will be incremented back to x_buffer_pointer_reg below */
+                x_buffer_current_reg = x_buffer_pointer_reg - 1;
+            }
+
+            x_buffer_current_reg++;
+
+            x_buffer_current_reg[h_len_reg - 1] = (x_snd_scale_reg * *x_snd_ptr_reg++);
+
+            sum = 0.0;
+            for (i = 0; i < h_len_reg; i++) {
+                sum += x_buffer_current_reg[i] * h_buf_reg[i];
+            }
+
+            *out_ptr_reg++ = (sample_type) sum;
 	} while (--n); /* inner loop */
 
-        /* using R_current is a bad idea on RS/6000: */
-        susp->R_current += togo;
+	    susp->x_buffer_pointer = x_buffer_pointer_reg;
+	    susp->x_buffer_current = x_buffer_current_reg;
+	    /* using x_snd_ptr_reg is a bad idea on RS/6000: */
+	    susp->x_snd_ptr += togo;
         out_ptr += togo;
+	    susp_took(x_snd_cnt, togo);
         cnt += togo;
     } /* outer loop */
 
@@ -202,9 +204,10 @@
 } /* convolve_s_fetch */
 
 
-void convolve_toss_fetch(snd_susp_type a_susp, snd_list_type snd_list)
+void convolve_toss_fetch(susp, snd_list)
+  register convolve_susp_type susp;
+  snd_list_type snd_list;
 {
-    convolve_susp_type susp = (convolve_susp_type) susp;
     time_type final_time = susp->susp.t0;
     long n;
 
@@ -219,36 +222,32 @@
     susp->x_snd_ptr += n;
     susp_took(x_snd_cnt, n);
     susp->susp.fetch = susp->susp.keep_fetch;
-    (*(susp->susp.fetch))(a_susp, snd_list);
+    (*(susp->susp.fetch))(susp, snd_list);
 }
 
 
-void convolve_mark(snd_susp_type a_susp)
+void convolve_mark(convolve_susp_type susp)
 {
-    convolve_susp_type susp = (convolve_susp_type) a_susp;
     sound_xlmark(susp->x_snd);
 }
 
 
-void convolve_free(snd_susp_type a_susp)
+void convolve_free(convolve_susp_type susp)
 {
-    convolve_susp_type susp = (convolve_susp_type) a_susp;
-    free(susp->R);
-    free(susp->X);
-    free(susp->H);
-    sound_unref(susp->x_snd);
+    table_unref(susp->table); 
+    free(susp->x_buffer_pointer);    sound_unref(susp->x_snd);
     ffree_generic(susp, sizeof(convolve_susp_node), "convolve_free");
 }
 
 
-void convolve_print_tree(snd_susp_type a_susp, int n)
+void convolve_print_tree(convolve_susp_type susp, int n)
 {
-    convolve_susp_type susp = (convolve_susp_type) a_susp;
     indent(n);
     stdputstr("x_snd:");
     sound_print_tree_1(susp->x_snd, n);
 }
 
+
 sound_type snd_make_convolve(sound_type x_snd, sound_type h_snd)
 {
     register convolve_susp_type susp;
@@ -256,38 +255,16 @@
     time_type t0 = x_snd->t0;
     sample_type scale_factor = 1.0F;
     time_type t0_min = t0;
-    table_type table;
-    double log_len;
     falloc_generic(susp, convolve_susp_node, "snd_make_convolve");
-    table = sound_to_table(h_snd);
-    susp->h_len = table->length;
-    log_len = log(table->length) / M_LN2; /* compute log-base-2(length) */
-    susp->M = (int) log_len;
-    if (susp->M != log_len) susp->M++; /* round up */
-    susp->N = 1 << susp->M; /* size of data blocks */
-    susp->M++; /* M = log2(2 * N) */
-    susp->H = (sample_type *) calloc(2 * susp->N, sizeof(susp->H[0]));
-    if (!susp->H) {
-        xlabort("memory allocation failure in convolve");
-    }
-    memcpy(susp->H, table->samples, sizeof(susp->H[0]) * susp->N);
-    table_unref(table); /* don't need table now */
-    /* remaining N samples are already zero-filled */
-    if (fftInit(susp->M)) {
-        free(susp->H);
-        xlabort("fft initialization error in convolve");
-    }
-    rffts(susp->H, susp->M, 1);
-    susp->X = (sample_type *) calloc(2 * susp->N, sizeof(susp->X[0]));
-    susp->R = (sample_type *) calloc(2 * susp->N, sizeof(susp->R[0]));
-    if (!susp->X || !susp->R) {
-        free(susp->H);
-        if (susp->X) free(susp->X);
-        if (susp->R) free(susp->R);
-        xlabort("memory allocation failed in convolve");
-    }
-    susp->R_current = susp->R + susp->N;
-    susp->susp.fetch = &convolve_s_fetch;
+    susp->table = sound_to_table(h_snd);
+    susp->h_buf = susp->table->samples;
+    susp->length_of_h = susp->table->length;
+    susp->h_len = (long) susp->length_of_h;
+         h_reverse(susp->h_buf, susp->h_len);
+    susp->x_buf_len = 2 * susp->h_len;
+    susp->x_buffer_pointer = calloc((2 * (susp->h_len)), sizeof(float));
+    susp->x_buffer_current = susp->x_buffer_pointer;
+    susp->susp.fetch = convolve_s_fetch;
     susp->terminate_cnt = UNKNOWN;
     /* handle unequal start times, if any */
     if (t0 < x_snd->t0) sound_prepend_zeros(x_snd, t0);