diff --git a/src/include/nanobench.h b/src/include/nanobench.h
index d233dcc..1f798b8 100644
--- a/src/include/nanobench.h
+++ b/src/include/nanobench.h
@@ -137,6 +137,11 @@ class Result;
 class Rng;
 class BigO;
 
+namespace detail {
+template <typename SetupOp>
+class SetupRunner; // forward declaration: Bench::setup() returns it and Bench befriends it
+} // namespace detail
+
 /**
  * @brief Renders output from a mustache-like template and benchmark results.
  *
@@ -1007,7 +1012,21 @@ class Bench {
     Bench& config(Config const& benchmarkConfig);
     ANKERL_NANOBENCH(NODISCARD) Config const& config() const noexcept;
 
+    /**
+     * @brief Configures a setup step that runs, untimed, before each epoch (fluent API).
+     *
+     * @param setupOp Callable taking no arguments. Chain `.run(...)` on the result: `bench.setup(...).run(...);`
+     */
+    template <typename SetupOp>
+    detail::SetupRunner<SetupOp> setup(SetupOp setupOp);
+
 private:
+    template <typename SetupOp, typename Op>
+    Bench& runImpl(SetupOp& setupOp, Op&& op); // shared by run(op) and detail::SetupRunner::run(op)
+    // SetupRunner is a friend so that its run() can reach the private runImpl().
+    template <typename SetupOp>
+    friend class detail::SetupRunner;
+
     Config mConfig{};
     std::vector<Result> mResults{};
 };
@@ -1207,14 +1226,44 @@ constexpr uint64_t Rng::rotl(uint64_t x, unsigned k) noexcept {
     return (x << k) | (x >> (64U - k));
 }
 
+namespace detail {
+
+// Fluent helper returned by Bench::setup(): pairs the setup callable with the Bench it came from.
+template <typename SetupOp>
+class SetupRunner {
+    SetupOp mSetup; // by-value copy of the setup callable
+    Bench& mTarget; // benchmark that run() delegates to
+
+public:
+    explicit SetupRunner(SetupOp setupOp, Bench& bench)
+        : mSetup(std::move(setupOp)), mTarget(bench) {}
+
+    // Forwards op, together with the stored setup callable, to Bench::runImpl().
+    template <typename Op>
+    ANKERL_NANOBENCH_NO_SANITIZE("integer")
+    Bench& run(Op&& op) {
+        return mTarget.runImpl(mSetup, std::forward<Op>(op));
+    }
+};
+} // namespace detail
+
 template <typename Op>
 ANKERL_NANOBENCH_NO_SANITIZE("integer")
 Bench& Bench::run(Op&& op) {
+    auto noSetup = [] {}; // no-op setup step: plain run() has nothing to prepare per epoch
+    return runImpl(noSetup, std::forward<Op>(op));
+}
+
+template <typename SetupOp, typename Op>
+ANKERL_NANOBENCH_NO_SANITIZE("integer")
+Bench& Bench::runImpl(SetupOp& setupOp, Op&& op) {
     // It is important that this method is kept short so the compiler can do better optimizations/ inlining of op()
     detail::IterationLogic iterationLogic(*this);
     auto& pc = detail::performanceCounters();
 
     while (auto n = iterationLogic.numIters()) {
+        setupOp();
+
         pc.beginMeasure();
         Clock::time_point const before = Clock::now();
         while (n-- > 0) {
@@ -1229,6 +1278,11 @@ Bench& Bench::run(Op&& op) {
     return *this;
 }
 
+// Wraps setupOp and this Bench in a SetupRunner so that `.run(op)` can be chained onto the result.
+template <typename SetupOp> detail::SetupRunner<SetupOp> Bench::setup(SetupOp setupOp) {
+    return detail::SetupRunner<SetupOp>{std::move(setupOp), *this};
+}
+
 // Performs all evaluations.
 template <typename Op>
 Bench& Bench::run(char const* benchmarkName, Op&& op) {
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index d544752..e7c918c 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -32,6 +32,7 @@ target_sources_local(nb PRIVATE
     unit_cold.cpp
     unit_exact_iters_and_epochs.cpp
     unit_romutrio.cpp
+    unit_setup.cpp
     unit_templates.cpp
     unit_timeunit.cpp
     unit_to_s.cpp
diff --git a/src/test/unit_setup.cpp b/src/test/unit_setup.cpp
new file mode 100644
index 0000000..bbc3e71
--- /dev/null
+++ b/src/test/unit_setup.cpp
@@ -0,0 +1,41 @@
+#include <nanobench.h>
+#include <thirdparty/doctest/doctest.h>
+
+#include <chrono>
+#include <thread>
+#include <vector>
+
+// NOLINTNEXTLINE
+TEST_CASE("unit_setup_time_is_excluded_from_measurement") {
+    namespace nb = ankerl::nanobench;
+    size_t numSetups = 0;
+    size_t numRuns = 0;
+
+    // The 50ms sleep lives in the setup step, which runs outside the timed section.
+    auto slowSetup = [&] {
+        ++numSetups;
+        std::this_thread::sleep_for(std::chrono::milliseconds(50));
+    };
+    nb::Bench bench;
+    bench.output(nullptr).warmup(0).epochs(3).epochIterations(1).performanceCounters(false);
+    bench.setup(slowSetup).run([&] { ++numRuns; });
+    REQUIRE(numSetups == 3);
+    REQUIRE(numRuns == 3);
+
+    // Had the sleeps been timed, the median epoch time could not stay this close to zero.
+    double const medianElapsedS = bench.results().back().median(nb::Result::Measure::elapsed);
+    REQUIRE(medianElapsedS < 0.01);
+}
+
+// NOLINTNEXTLINE
+TEST_CASE("unit_setup_runs_before_each_epoch") {
+    // 2 epochs of 3 iterations each: every epoch must start with exactly one setup call ('S').
+    std::vector<char> const expected{'S', 'R', 'R', 'R', 'S', 'R', 'R', 'R'};
+    std::vector<char> trace;
+
+    ankerl::nanobench::Bench bench;
+    bench.output(nullptr).warmup(0).epochs(2).epochIterations(3).performanceCounters(false);
+    bench.setup([&] { trace.push_back('S'); }).run([&] { trace.push_back('R'); });
+
+    REQUIRE(trace == expected);
+}