From 74f764fec2d7a043af47837e492af46775a14028 Mon Sep 17 00:00:00 2001
From: Trent Piepho <tpiepho@gmail.com>
Date: Tue, 30 Mar 2021 23:56:45 -0700
Subject: [PATCH 1/3] Process only needed steps for large speedup

I measured this as reducing CPU usage from ~78% to ~45%.  Almost
doubling the speed, which is expected as described below.

Tg processes the audio in a series of steps that double in size:  2, 4,
8, and then 16 seconds long.  This is the one to four dots shown in the
display.

Tg starts at step 1 (2 seconds) and goes up, stopping if a step doesn't
pass.  Data from smaller steps isn't used if a larger step passes.

This means when running at a constant step 4 with good signal, CPU usage
is almost double what it needs to be, since steps 1-3 are processed and
unused and add up to 14 seconds, almost as much as step 4's 16 seconds.

Change this to start at the previous iteration's step.  With good
signal, only step 4 will be processed and CPU usage is cut almost in
half.  If the previous step fails, it will try smaller steps.  If it
passes and is not yet the top step, it will try larger steps.

End result should end up on the same step as before, but get there
sooner.  It could be slower if the step drops a lot, e.g. from 4 to 0,
but this happens far less often than the step staying the same or nearly
the same.

To do this, I moved the step logic out of analyze_pa_data() and entirely
into compute_update().  analyze_pa_data() will now just process one
step and compute_update() decides which step(s).  Previously,
analyze_pa_data() did all steps and compute_update() decided which step
to actually use.

There is a small change to the algorithm.  To be good, a step needs to
pass a number of checks done in process().  Then there is one more
check, of sigma vs period, done in compute_update().

Previously a step didn't need to pass the sigma check and it still
counted enough to increase last_tic and show up as a signal dot in the
display.  But the data wasn't actually used unless it passed the sigma
check too.

I no longer keep track of "partially" passing steps like this.  Either
it passes all checks, including the sigma check, or not.  last_tic and
signal level only count fully passing steps.

I see no practical difference in my tests, but I think it could show up
with some kind of marginal signal that has a high error in period
estimation with the longer steps.
---
 src/audio.c    | 29 ++++++++++-------------------
 src/computer.c | 45 ++++++++++++++++++++++++++++++++++++---------
 src/tg.h       |  6 +++++-
 3 files changed, 51 insertions(+), 29 deletions(-)

diff --git a/src/audio.c b/src/audio.c
index 05574fe..d8a86db 100644
--- a/src/audio.c
+++ b/src/audio.c
@@ -164,7 +164,7 @@ uint64_t get_timestamp(int light)
 	return ts;
 }
 
-static void fill_buffers(struct processing_buffers *ps, int light)
+void fill_buffers(struct processing_buffers *ps, int light)
 {
 	pthread_mutex_lock(&audio_mutex);
 	uint64_t ts = timestamp;
@@ -187,26 +187,17 @@ static void fill_buffers(struct processing_buffers *ps, int light)
 	}
 }
 
-int analyze_pa_data(struct processing_data *pd, int bph, double la, uint64_t events_from)
+/* Returns if buffer was processed ok */
+bool analyze_pa_data(struct processing_data *pd, int step, int bph, double la, uint64_t events_from)
 {
-	struct processing_buffers *p = pd->buffers;
-	fill_buffers(p, pd->is_light);
+	struct processing_buffers *p = &pd->buffers[step];
 
-	int i;
-	debug("\nSTART OF COMPUTATION CYCLE\n\n");
-	for(i=0; i<NSTEPS; i++) {
-		p[i].last_tic = pd->last_tic;
-		p[i].events_from = events_from;
-		process(&p[i], bph, la, pd->is_light);
-		if( !p[i].ready ) break;
-		debug("step %d : %f +- %f\n",i,p[i].period/p[i].sample_rate,p[i].sigma/p[i].sample_rate);
-	}
-	if(i) {
-		pd->last_tic = p[i-1].last_tic;
-		debug("%f +- %f\n",p[i-1].period/p[i-1].sample_rate,p[i-1].sigma/p[i-1].sample_rate);
-	} else
-		debug("---\n");
-	return i;
+	p->last_tic = pd->last_tic;
+	p->events_from = events_from;
+	process(p, bph, la, pd->is_light);
+	debug("step %d : %f +- %f\n", step, p->period/p->sample_rate, p->sigma/p->sample_rate);
+
+	return p->ready;
 }
 
 int analyze_pa_data_cal(struct processing_data *pd, struct calibration_data *cd)
diff --git a/src/computer.c b/src/computer.c
index 03942d3..938785e 100644
--- a/src/computer.c
+++ b/src/computer.c
@@ -91,19 +91,45 @@ static void compute_update_cal(struct computer *c)
 
 static void compute_update(struct computer *c)
 {
-	int signal = analyze_pa_data(c->pdata, c->actv->bph, c->actv->la, c->actv->events_from);
-	struct processing_buffers *p = c->pdata->buffers;
-	int i;
-	for(i=0; i<NSTEPS && p[i].ready; i++);
-	for(i--; i>=0 && p[i].sigma > p[i].period / 10000; i--);
-	if(i>=0) {
+	struct processing_data *pd = c->pdata;
+	struct processing_buffers *ps = pd->buffers;
+	int step = pd->last_step;
+
+	pd->last_step = 0;
+	/* Do all buffers at once so that all computation interval(s) use the
+	 * same data.  Buffers for some intervals will probably not be used, but
+	 * it's not expensive to fill them.  Processing is the slow part.  */
+	fill_buffers(ps, pd->is_light);
+
+	debug("\nSTART OF COMPUTATION CYCLE\n\n");
+	unsigned int stepmask = BITMASK(NSTEPS); // Mask of available steps
+	do {
+		stepmask &= ~BIT(step);
+		analyze_pa_data(c->pdata, step, c->actv->bph, c->actv->la, c->actv->events_from);
+
+		if (ps[step].ready && ps[step].sigma < ps[step].period / 10000) {
+			// Try next step if it's available
+			if (stepmask & BIT(step+1)) step++;
+		} else {
+			// This step didn't pass, try a lesser step
+			step--;
+		}
+	} while(step >= 0 && stepmask & BIT(step));
+
+	if (step >= 0) {
+		debug("%f +- %f\n", ps[step].period/ps[step].sample_rate, ps[step].sigma/ps[step].sample_rate);
+		pd->last_tic = ps[step].last_tic;
+		pd->last_step = step;
+
 		if(c->actv->pb) pb_destroy_clone(c->actv->pb);
-		c->actv->pb = pb_clone(&p[i]);
+		c->actv->pb = pb_clone(&ps[step]);
 		c->actv->is_old = 0;
-		c->actv->signal = i == NSTEPS-1 && p[i].amp < 0 ? signal-1 : signal;
+		/* Signal's range is 0 to NSTEPS, while step is -1 to NSTEPS-1, i.e. signal = step+1 */
+		c->actv->signal = step == NSTEPS-1 && ps[step].amp < 0 ? step : step+1;
 	} else {
+		debug("---\n");
 		c->actv->is_old = 1;
-		c->actv->signal = -signal;
+		c->actv->signal = 0;
 	}
 }
 
@@ -251,6 +277,7 @@ struct computer *start_computer(int nominal_sr, int bph, double la, int cal, int
 	pd->buffers = p;
 	pd->last_tic = 0;
 	pd->is_light = light;
+	pd->last_step = 0;
 
 	struct calibration_data *cd = malloc(sizeof(struct calibration_data));
 	setup_cal_data(cd);
diff --git a/src/tg.h b/src/tg.h
index 789f66c..04f3e11 100644
--- a/src/tg.h
+++ b/src/tg.h
@@ -75,6 +75,8 @@
 #endif
 
 #define UNUSED(X) (void)(X)
+#define BIT(n) (1u << (n))
+#define BITMASK(n) ((1u << (n)) - 1u)
 
 /* algo.c */
 struct processing_buffers {
@@ -122,13 +124,15 @@ int process_cal(struct processing_buffers *p, struct calibration_data *cd);
 struct processing_data {
 	struct processing_buffers *buffers;
 	uint64_t last_tic;
+	int last_step;	//!< Guess of step (buffers index) to try first, based on last iteration
 	int is_light;
 };
 
 int start_portaudio(int *nominal_sample_rate, double *real_sample_rate);
 int terminate_portaudio();
 uint64_t get_timestamp(int light);
-int analyze_pa_data(struct processing_data *pd, int bph, double la, uint64_t events_from);
+void fill_buffers(struct processing_buffers *ps, int light);
+bool analyze_pa_data(struct processing_data *pd, int step, int bph, double la, uint64_t events_from);
 int analyze_pa_data_cal(struct processing_data *pd, struct calibration_data *cd);
 void set_audio_light(bool light);
 

From 87e06226bc5b945ab3fa242ff6f6bc8fec7bbc23 Mon Sep 17 00:00:00 2001
From: Trent Piepho <tpiepho@gmail.com>
Date: Thu, 1 Apr 2021 16:15:57 -0700
Subject: [PATCH 2/3] Draw partial signal dot for no amplitude

Previously max signal level (4 dots) but with no amplitude measured was
counted as signal level 3.  But level 3 or lower with no amplitude still
counts as the same level.

Have compute_update() no longer do this signal level adjustment and just
report the level used, which indicates the averaging interval.

The dot graphic will now indicate "no amplitude" by using a hollow dot
for the final signal level's dot.

This way the number of dots always shows the averaging interval and a
hollow dot always shows that the signal is too poor to measure
amplitude.
---
 src/computer.c     |  2 +-
 src/output_panel.c | 24 +++++++++++++++++-------
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/computer.c b/src/computer.c
index 938785e..3f86ea3 100644
--- a/src/computer.c
+++ b/src/computer.c
@@ -125,7 +125,7 @@ static void compute_update(struct computer *c)
 		c->actv->pb = pb_clone(&ps[step]);
 		c->actv->is_old = 0;
 		/* Signal's range is 0 to NSTEPS, while step is -1 to NSTEPS-1, i.e. signal = step+1 */
-		c->actv->signal = step == NSTEPS-1 && ps[step].amp < 0 ? step : step+1;
+		c->actv->signal = step+1;
 	} else {
 		debug("---\n");
 		c->actv->is_old = 1;
diff --git a/src/output_panel.c b/src/output_panel.c
index 0e7fc8b..835e4f8 100644
--- a/src/output_panel.c
+++ b/src/output_panel.c
@@ -112,11 +112,20 @@ static double amplitude_to_time(double lift_angle, double amp)
 	return asin(lift_angle / (2 * amp)) / M_PI;
 }
 
-static double draw_watch_icon(cairo_t *c, int signal, int happy, int light)
+/** Draw the watch graphic that has status info.
+ *
+ * @param[in,out] c Cairo context to use.
+ * @param signal Signal level, i.e. dots, 0 to NSTEPS inclusive.
+ * @param partial Specified signal level is only partially achieved.
+ * @param happy Green happy face or red frowny face.
+ * @param light Indicate light sampling mode.
+ * @return Y coordinate of top margin.
+ */
+
+static double draw_watch_icon(cairo_t *c, int signal, bool partial, bool happy, bool light)
 {
-	happy = !!happy;
-	cairo_set_line_width(c,3);
-	cairo_set_source(c,happy?green:red);
+	cairo_set_line_width(c, 3);
+	cairo_set_source(c, happy ? green : red);
 	cairo_move_to(c, OUTPUT_WINDOW_HEIGHT * 0.5, OUTPUT_WINDOW_HEIGHT * 0.5);
 	cairo_line_to(c, OUTPUT_WINDOW_HEIGHT * 0.75, OUTPUT_WINDOW_HEIGHT * (0.75 - 0.5*happy));
 	cairo_move_to(c, OUTPUT_WINDOW_HEIGHT * 0.5, OUTPUT_WINDOW_HEIGHT * 0.5);
@@ -126,7 +135,7 @@ static double draw_watch_icon(cairo_t *c, int signal, int happy, int light)
 	cairo_stroke(c);
 	int l = OUTPUT_WINDOW_HEIGHT * 0.8 / (2*NSTEPS - 1);
 	int i;
-	cairo_set_line_width(c,1);
+	cairo_set_line_width(c, 1);
 	for(i = 0; i < signal; i++) {
 		cairo_move_to(c, OUTPUT_WINDOW_HEIGHT + 0.5*l, OUTPUT_WINDOW_HEIGHT * 0.9 - 2*i*l);
 		cairo_line_to(c, OUTPUT_WINDOW_HEIGHT + 1.5*l, OUTPUT_WINDOW_HEIGHT * 0.9 - 2*i*l);
@@ -134,7 +143,7 @@ static double draw_watch_icon(cairo_t *c, int signal, int happy, int light)
 		cairo_line_to(c, OUTPUT_WINDOW_HEIGHT + 0.5*l, OUTPUT_WINDOW_HEIGHT * 0.9 - (2*i+1)*l);
 		cairo_line_to(c, OUTPUT_WINDOW_HEIGHT + 0.5*l, OUTPUT_WINDOW_HEIGHT * 0.9 - 2*i*l);
 		cairo_stroke_preserve(c);
-		cairo_fill(c);
+		if (i < signal-1 || !partial) cairo_fill(c);
 	}
 	if(light) {
 		int l = OUTPUT_WINDOW_HEIGHT * 0.15;
@@ -194,7 +203,8 @@ static gboolean output_draw_event(GtkWidget *widget, cairo_t *c, struct output_p
 	struct processing_buffers *p = snst->pb;
 	int old = snst->is_old;
 
-	double x = draw_watch_icon(c,snst->signal,snst->calibrate ? snst->signal==NSTEPS : snst->signal, snst->is_light);
+	double x = draw_watch_icon(c, snst->signal, snst->amp <= 0,
+				   snst->signal >= (snst->calibrate ? NSTEPS : 1), snst->is_light);
 
 	cairo_text_extents_t extents;
 

From cbd3a00c4328c28fbae4043cd14ce70f80bb2546 Mon Sep 17 00:00:00 2001
From: Trent Piepho <tpiepho@gmail.com>
Date: Sun, 18 Apr 2021 16:43:49 -0700
Subject: [PATCH 3/3] Change one sample sigma estimation to 0

When there is just one period in the processing buffer it is not
possible to calculate the sample standard deviation (sigma).  In this
case, the period value was being used as the sigma estimate, which
effectively gives a huge sigma when there is 1 period in the buffer.

This results in the processing buffer being rejected as bad and a larger
buffer, which would have multiple periods, is never tried.

One period per buffer would happen with a combination of long period and
short buffer, e.g. a small BPH in light mode, since light mode uses
buffers half as long as normal mode.

Use 0 as the sigma value for 1 sample.  This means the buffer will pass
the sigma check and a larger buffer will be tried.
---
 src/algo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/algo.c b/src/algo.c
index 1c744cb..fa52137 100644
--- a/src/algo.c
+++ b/src/algo.c
@@ -452,7 +452,7 @@ static int compute_period(struct processing_buffers *b, int bph)
 	if(count > 1)
 		b->sigma = sqrt((sq_sum - count * estimate * estimate)/ (count-1));
 	else
-		b->sigma = b->period;
+		b->sigma = 0;	// No std. dev. estimate possible with just 1 sample
 	return 0;
 }