From 2b36f95de358caf8a8c499ee5fe18da9d945109c Mon Sep 17 00:00:00 2001 From: vijk777 Date: Tue, 27 Jan 2026 10:26:43 -0800 Subject: [PATCH] fix: save profile after 5 epochs --- src/LatentEvolution/latent_stag.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/LatentEvolution/latent_stag.py b/src/LatentEvolution/latent_stag.py index 959a185..6d8ac23 100644 --- a/src/LatentEvolution/latent_stag.py +++ b/src/LatentEvolution/latent_stag.py @@ -397,10 +397,13 @@ def handle_sigusr2(signum, frame): # epoch loop for epoch in range(cfg.training.epochs): - # stop profiler after first N epochs to limit file size + # stop profiler after first N epochs and save trace immediately if epoch == profile_first_n_epochs and profiler.is_enabled(): profiler.stop() - print(f"profiler stopped after epoch {epoch} (profiled first {profile_first_n_epochs} epochs)") + trace_path = run_dir / "pipeline_trace.json" + profiler.save(trace_path) + print(f"profiler stopped after epoch {epoch}, saved to {trace_path}") + profiler.print_stats() with profiler.event("epoch", "training", thread="main", epoch=epoch): epoch_start = datetime.now() @@ -557,12 +560,13 @@ def handle_sigusr2(signum, frame): chunk_loader.cleanup() - # save profiler trace and stats - profiler.stop() - trace_path = run_dir / "pipeline_trace.json" - profiler.save(trace_path) - print(f"saved pipeline trace to {trace_path}") - profiler.print_stats() + # save profiler if still running (epochs <= profile_first_n_epochs) + if profiler.is_enabled(): + profiler.stop() + trace_path = run_dir / "pipeline_trace.json" + profiler.save(trace_path) + print(f"saved pipeline trace to {trace_path}") + profiler.print_stats() writer.close() print("training complete")