Skip to content
Merged
1 change: 1 addition & 0 deletions .bazelversion
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
8.5.1
20 changes: 18 additions & 2 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,11 @@ cc_library(
"vendor/genplusgx/cd_hw",
"vendor/genplusgx/debug",
],
# Link math library on Linux (handled automatically on other platforms)
linkopts = ["-lm"],
# Link the math library explicitly on Linux only; every other platform takes the default branch and gets no extra linkopts (e.g. macOS provides libm via libSystem)
linkopts = select({
"@platforms//os:linux": ["-lm"],
"//conditions:default": [],
}),
)

# gxtest C++ wrapper library
Expand Down Expand Up @@ -129,3 +132,16 @@ cc_test(
"@googletest//:gtest_main",
],
)

# Profiler test: compiles tests/profiler_test.cpp together with the prime
# sieve ROM header and links it against :gxtest and gtest_main.
cc_test(
    name = "gxtest_profiler",
    srcs = [
        "tests/prime_sieve_rom.h",
        "tests/profiler_test.cpp",
    ],
    deps = [
        ":gxtest",
        "@googletest//:gtest_main",
    ],
)
24 changes: 23 additions & 1 deletion include/profiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
* GX::Profiler profiler;
* profiler.AddFunction(0x001000, 0x001100, "generate_moves");
* profiler.AddFunction(0x001100, 0x001200, "score_move");
* // Or load from ELF: profiler.LoadSymbols("game.elf");
* // Or load from ELF: profiler.LoadSymbolsFromELF("game.elf");
*
* profiler.Start();
* emu.RunFrames(1000);
Expand Down Expand Up @@ -37,6 +37,14 @@ enum class ProfileMode {
CallStack // Tracks call stack for inclusive cycle counts
};

/**
 * Options bundle accepted by Profiler::Start(const ProfileOptions&).
 */
struct ProfileOptions {
    /// Profiling mode; defaults to ProfileMode::Simple.
    ProfileMode mode{ProfileMode::Simple};

    /// Sampling interval: 1 = every instruction, N = every Nth instruction.
    /// Profiler::Start() treats a value of 0 as 1.
    uint32_t sample_rate{1};
};

/**
* Statistics for a single function
*/
Expand Down Expand Up @@ -126,6 +134,12 @@ class Profiler {
*/
void Start(ProfileMode mode = ProfileMode::Simple);

/**
 * Start profiling with options
 *
 * Does nothing if the profiler is already running.
 *
 * @param options ProfileOptions struct with mode and sample_rate.
 *                A sample_rate of 0 is treated as 1 (every instruction).
 */
void Start(const ProfileOptions& options);

/**
* Stop profiling - removes the cpu_hook callback
*/
Expand Down Expand Up @@ -162,6 +176,11 @@ class Profiler {
*/
uint64_t GetTotalCycles() const { return total_cycles_; }

/**
 * Get current sample rate (1 = every instruction)
 *
 * Defaults to 1 before Start() is called; Start() stores a requested
 * rate of 0 as 1.
 */
uint32_t GetSampleRate() const { return sample_rate_; }

/**
* Print a formatted profile report
* @param out Output stream
Expand Down Expand Up @@ -198,6 +217,9 @@ class Profiler {
uint32_t last_pc_ = 0;
int64_t last_cycles_ = 0;
uint64_t total_cycles_ = 0;
uint32_t sample_rate_ = 1; // Sampling interval set by Start(); 1 = attribute cycles on every instruction
uint32_t sample_counter_ = 0; // OnExecute calls since the last sample; reset to 0 when it reaches sample_rate_
int64_t pending_cycles_ = 0; // Accumulated cycles since last sample (for sampling mode)
};

/** Global profiler instance (needed for cpu_hook callback) */
Expand Down
4 changes: 4 additions & 0 deletions roms/prime_sieve/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ typedef unsigned int uint32_t;
/**
* Initialize the sieve array to all zeros (0 = potentially prime)
*/
__attribute__((noinline))
static void clear_sieve(void)
{
for (int i = 0; i < SIEVE_SIZE; i++) {
Expand All @@ -40,6 +41,7 @@ static void clear_sieve(void)
/**
* Mark 0 and 1 as composite (not prime)
*/
__attribute__((noinline))
static void mark_trivial_composites(void)
{
SIEVE_ARRAY[0] = 1; /* 0 is not prime */
Expand All @@ -50,6 +52,7 @@ static void mark_trivial_composites(void)
* Run the Sieve of Eratosthenes algorithm
* For each prime p, mark all multiples of p as composite
*/
__attribute__((noinline))
static void run_sieve(void)
{
/* Only need to check up to sqrt(SIEVE_SIZE) ≈ 24 */
Expand All @@ -69,6 +72,7 @@ static void run_sieve(void)
/**
* Collect the first NUM_PRIMES primes from the sieve into the results array
*/
__attribute__((noinline))
static void collect_primes(void)
{
int count = 0;
Expand Down
32 changes: 17 additions & 15 deletions roms/prime_sieve/prime_sieve_rom.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
namespace GX {
namespace TestRoms {

// Prime Sieve ROM (676 bytes)
// Prime Sieve ROM (706 bytes)
// Computes first 100 primes using Sieve of Eratosthenes
// Results written to:
// $FF0300-$FF03C7: Prime values (100 x 16-bit words)
// $FF0500: Prime count
// $FF0502: Done flag ($DEAD when complete)

constexpr size_t PRIME_SIEVE_ROM_SIZE = 676;
constexpr size_t PRIME_SIEVE_ROM_SIZE = 706;

constexpr uint8_t PRIME_SIEVE_ROM[] = {
0x00, 0xff, 0xfe, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00,
Expand Down Expand Up @@ -63,20 +63,22 @@ constexpr uint8_t PRIME_SIEVE_ROM[] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4f, 0xf9, 0x00, 0xff,
0xfe, 0x00, 0x4e, 0xb9, 0x00, 0x00, 0x02, 0x10, 0x60, 0xfe, 0x00, 0x00,
0xfe, 0x00, 0x4e, 0xb9, 0x00, 0x00, 0x02, 0xa0, 0x60, 0xfe, 0x00, 0x00,
0x20, 0x7c, 0x00, 0xff, 0x00, 0x00, 0x10, 0xfc, 0x00, 0x00, 0xb1, 0xfc,
0x00, 0xff, 0x02, 0x58, 0x66, 0xf4, 0x13, 0xfc, 0x00, 0x01, 0x00, 0xff,
0x00, 0x00, 0x13, 0xfc, 0x00, 0x01, 0x00, 0xff, 0x00, 0x01, 0x72, 0x02,
0x20, 0x41, 0xd1, 0xfc, 0x00, 0xff, 0x00, 0x00, 0x10, 0x10, 0x67, 0x46,
0x52, 0x81, 0x70, 0x19, 0xb0, 0x81, 0x66, 0xec, 0x22, 0x7c, 0x00, 0xff,
0x00, 0x02, 0x72, 0x00, 0x10, 0x11, 0x66, 0x12, 0x70, 0x00, 0x30, 0x01,
0xd0, 0x80, 0x20, 0x40, 0xd1, 0xfc, 0x00, 0xff, 0x03, 0x00, 0x30, 0x89,
0x52, 0x81, 0xb3, 0xfc, 0x00, 0xff, 0x02, 0x57, 0x67, 0x08, 0x52, 0x89,
0x70, 0x64, 0xb0, 0x81, 0x66, 0xda, 0x33, 0xc1, 0x00, 0xff, 0x05, 0x00,
0x33, 0xfc, 0xde, 0xad, 0x00, 0xff, 0x05, 0x02, 0x4e, 0x75, 0x70, 0x00,
0x30, 0x01, 0xd0, 0x80, 0x20, 0x40, 0xd1, 0xfc, 0x00, 0xff, 0x00, 0x00,
0x10, 0xbc, 0x00, 0x01, 0xd0, 0x81, 0x0c, 0x80, 0x00, 0x00, 0x02, 0x57,
0x6f, 0xea, 0x60, 0x9c
0x00, 0xff, 0x02, 0x58, 0x66, 0xf4, 0x4e, 0x75, 0x13, 0xfc, 0x00, 0x01,
0x00, 0xff, 0x00, 0x00, 0x13, 0xfc, 0x00, 0x01, 0x00, 0xff, 0x00, 0x01,
0x4e, 0x75, 0x72, 0x02, 0x20, 0x41, 0xd1, 0xfc, 0x00, 0xff, 0x00, 0x00,
0x10, 0x10, 0x66, 0x1c, 0x70, 0x00, 0x30, 0x01, 0xd0, 0x80, 0x20, 0x40,
0xd1, 0xfc, 0x00, 0xff, 0x00, 0x00, 0x10, 0xbc, 0x00, 0x01, 0xd0, 0x81,
0x0c, 0x80, 0x00, 0x00, 0x02, 0x57, 0x6f, 0xea, 0x52, 0x81, 0x70, 0x19,
0xb0, 0x81, 0x66, 0xd0, 0x4e, 0x75, 0x22, 0x7c, 0x00, 0xff, 0x00, 0x02,
0x72, 0x00, 0x10, 0x11, 0x66, 0x12, 0x70, 0x00, 0x30, 0x01, 0xd0, 0x80,
0x20, 0x40, 0xd1, 0xfc, 0x00, 0xff, 0x03, 0x00, 0x30, 0x89, 0x52, 0x81,
0xb3, 0xfc, 0x00, 0xff, 0x02, 0x57, 0x67, 0x08, 0x52, 0x89, 0x70, 0x64,
0xb0, 0x81, 0x66, 0xda, 0x33, 0xc1, 0x00, 0xff, 0x05, 0x00, 0x4e, 0x75,
0x4e, 0xb9, 0x00, 0x00, 0x02, 0x10, 0x4e, 0xb9, 0x00, 0x00, 0x02, 0x24,
0x4e, 0xb9, 0x00, 0x00, 0x02, 0x36, 0x4e, 0xb9, 0x00, 0x00, 0x02, 0x6a,
0x33, 0xfc, 0xde, 0xad, 0x00, 0xff, 0x05, 0x02, 0x4e, 0x75
};

// First 100 prime numbers for verification
Expand Down
83 changes: 75 additions & 8 deletions src/profiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ Profiler::~Profiler() {
}

void Profiler::AddFunction(uint32_t start_addr, uint32_t end_addr, const std::string& name) {
// Validate function range
if (end_addr <= start_addr) {
return; // Invalid range, ignore
}

FunctionDef func = {start_addr, end_addr, name};

// Insert in sorted order by start_addr
Expand All @@ -60,7 +65,17 @@ void Profiler::AddFunction(uint32_t start_addr, uint32_t end_addr, const std::st
int Profiler::LoadSymbolsFromELF(const std::string& elf_path) {
// Use nm to extract symbols
// Format: "address type name"
std::string cmd = "nm -S --defined-only " + elf_path + " 2>/dev/null";
// Shell-quote the path to prevent command injection
std::string quoted_path = "'";
for (char c : elf_path) {
if (c == '\'') {
quoted_path += "'\\''"; // End quote, escaped quote, start quote
} else {
quoted_path += c;
}
}
quoted_path += "'";
std::string cmd = "nm -S --defined-only " + quoted_path + " 2>/dev/null";
FILE* pipe = popen(cmd.c_str(), "r");
if (!pipe) {
return -1;
Expand All @@ -73,17 +88,21 @@ int Profiler::LoadSymbolsFromELF(const std::string& elf_path) {
char type;
char name[256];

// Parse: "address size type name" (with size) or "address type name" (without)
// Parse nm -S output: "hex_address hex_size type name" (both in hex, with size) or "hex_address type name" (without size)
if (sscanf(line, "%x %x %c %255s", &addr, &size, &type, name) == 4) {
// Has size - use it
if (type == 'T' || type == 't') { // Text (code) symbols only
AddFunction(addr, addr + size, name);
// Guard against overflow
uint32_t end_addr = (size <= UINT32_MAX - addr) ? addr + size : UINT32_MAX;
AddFunction(addr, end_addr, name);
count++;
}
} else if (sscanf(line, "%x %c %255s", &addr, &type, name) == 3) {
// No size - estimate from next symbol (done after loading all)
if (type == 'T' || type == 't') {
AddFunction(addr, addr + 0x100, name); // Default 256 bytes
// Guard against overflow (0x100 = 256 byte default)
uint32_t end_addr = (addr <= UINT32_MAX - 0x100) ? addr + 0x100 : UINT32_MAX;
AddFunction(addr, end_addr, name);
count++;
}
}
Expand All @@ -102,6 +121,16 @@ int Profiler::LoadSymbolsFromELF(const std::string& elf_path) {
}

int Profiler::LoadSymbolsFromFile(const std::string& path) {
// Load symbols from a simple text file format:
// <hex_address> <decimal_size> <name>
// Example:
// 00000200 16 _start
// 00000210 94 main
//
// Parsing note:
// - This function parses the address as hex (%x) and the size as decimal (%u).
// - This intentionally differs from LoadSymbolsFromELF (nm output), which uses
// hex for both address and size (%x for each).
std::ifstream file(path);
if (!file) {
return -1;
Expand All @@ -114,7 +143,9 @@ int Profiler::LoadSymbolsFromFile(const std::string& path) {
char name[256];

if (sscanf(line.c_str(), "%x %u %255s", &addr, &size, name) == 3) {
AddFunction(addr, addr + size, name);
// Guard against overflow
uint32_t end_addr = (size <= UINT32_MAX - addr) ? addr + size : UINT32_MAX;
AddFunction(addr, end_addr, name);
count++;
}
}
Expand All @@ -128,9 +159,19 @@ void Profiler::ClearSymbols() {
}

void Profiler::Start(ProfileMode mode) {
    // Mode-only convenience overload: build an options bundle that samples
    // every instruction (sample_rate 1) and hand it to the options-based
    // Start(), which does the actual setup.
    ProfileOptions every_instruction;
    every_instruction.sample_rate = 1;
    every_instruction.mode = mode;
    Start(every_instruction);
}

void Profiler::Start(const ProfileOptions& options) {
if (running_) return;

mode_ = mode;
mode_ = options.mode;
sample_rate_ = options.sample_rate > 0 ? options.sample_rate : 1;
sample_counter_ = 0;
pending_cycles_ = 0;
g_active_profiler = this;
set_cpu_hook(ProfilerHook);
running_ = true;
Expand All @@ -153,6 +194,7 @@ void Profiler::Reset() {
}
call_stack_.clear();
total_cycles_ = 0;
pending_cycles_ = 0;
last_pc_ = 0;
if (running_) {
last_cycles_ = m68k.cycles;
Expand Down Expand Up @@ -184,7 +226,7 @@ const FunctionDef* Profiler::LookupFunction(uint32_t addr) const {

uint16_t Profiler::ReadWord(uint32_t addr) const {
// Read from ROM (cart.rom is byteswapped on little-endian)
if (addr < 0x400000 && addr < cart.romsize) {
if (addr < 0x400000 && addr + 1 < cart.romsize) {
#ifdef LSB_FIRST
return (cart.rom[addr ^ 1] << 8) | cart.rom[(addr + 1) ^ 1];
#else
Expand All @@ -204,13 +246,30 @@ void Profiler::OnExecute(uint32_t pc) {

total_cycles_ += delta;

// Sampling: only do expensive work every Nth instruction
if (sample_rate_ > 1) {
pending_cycles_ += delta;
sample_counter_++;
if (sample_counter_ < sample_rate_) {
// Update last_pc_ even on skipped samples so CallStack mode
// can track call/return instructions correctly
last_pc_ = pc;
return;
}
sample_counter_ = 0;
// Use accumulated cycles since last sample (more accurate than scaling)
delta = pending_cycles_;
pending_cycles_ = 0;
}

// Attribute cycles to current function
const FunctionDef* func = LookupFunction(pc);
if (func) {
auto& s = stats_[func->start_addr];
s.cycles_exclusive += delta;

// Count function entry (PC moved into this function from outside)
// Note: With sampling, this undercounts entries that happen between samples
if (last_pc_ != 0) {
const FunctionDef* last_func = LookupFunction(last_pc_);
if (last_func != func) {
Expand All @@ -220,12 +279,17 @@ void Profiler::OnExecute(uint32_t pc) {
}

// CallStack mode: track JSR/BSR/RTS for inclusive cycles
// Note: With sampling enabled, we only check every Nth instruction for
// call/return opcodes, so inclusive timing will be less accurate.
if (mode_ == ProfileMode::CallStack && last_pc_ != 0) {
uint16_t opcode = ReadWord(last_pc_);

if (IsCallOpcode(opcode)) {
// Entering a new function - push frame
if (func) {
// Limit stack depth to prevent unbounded growth from unbalanced calls
// (e.g., indirect jumps, exceptions, or non-standard control flow)
constexpr size_t MAX_CALL_STACK_DEPTH = 256;
if (func && call_stack_.size() < MAX_CALL_STACK_DEPTH) {
call_stack_.push_back({func->start_addr, current_cycles});
}
} else if (IsReturnOpcode(opcode) && !call_stack_.empty()) {
Expand Down Expand Up @@ -279,6 +343,9 @@ void Profiler::PrintReport(std::ostream& out, size_t max_functions) const {
// Print header
bool show_inclusive = (mode_ == ProfileMode::CallStack);
out << "\n";
if (sample_rate_ > 1) {
out << "Sample rate: 1/" << sample_rate_ << " (estimated cycles)\n";
}
out << std::setw(30) << std::left << "Function"
<< std::setw(12) << std::right << "Cycles";
if (show_inclusive) {
Expand Down
Loading