diff --git a/mlir/include/mlir/Dialect/Rock/Tuning/RockTuning.h b/mlir/include/mlir/Dialect/Rock/Tuning/RockTuning.h
index d4ebe9ea3975..622d9f27554b 100644
--- a/mlir/include/mlir/Dialect/Rock/Tuning/RockTuning.h
+++ b/mlir/include/mlir/Dialect/Rock/Tuning/RockTuning.h
@@ -52,6 +52,9 @@ struct ParamEntry {
 struct TuningParamSet {
   llvm::SetVector<RockTuningParamAttrInterface> tuningRange;
   KernelType primaryOpType;
+  // The tuning kind that was actually used (may differ from requested kind,
+  // e.g. Greedy falls back to Exhaustive for non-accel).
+  TuningParamSetKind effectiveKind;
 };
 
 struct TuningParamSpaceSettings {
diff --git a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp
index ca0afea45388..5be7e7f66f2c 100644
--- a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp
+++ b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp
@@ -965,6 +965,7 @@ createTunableParamSpace(ModuleOp mod, TuningParamSetKind kind,
                         rock::TuningParamSpaceSettings &settings) {
   struct TuningParamSet *newSpace;
   newSpace = new TuningParamSet();
+  newSpace->effectiveKind = kind;
 
   // create range and heuristic
   WalkResult findPrimary =
@@ -974,6 +975,7 @@ createTunableParamSpace(ModuleOp mod, TuningParamSetKind kind,
         // greedy is not implemented for non-accel
         if (!archInfo.isAccel(op) && kind == TuningParamSetKind::Greedy) {
           kind = TuningParamSetKind::Exhaustive;
+          newSpace->effectiveKind = kind;
           llvm::errs() << "Greedy tuning not implemented for non-accel, using "
                           "Exhaustive instead\n";
         }
diff --git a/mlir/tools/rocmlir-tuning-driver/ConcurrentQueue.h b/mlir/tools/rocmlir-tuning-driver/ConcurrentQueue.h
index a2833936300e..92d4f69fa4c1 100644
--- a/mlir/tools/rocmlir-tuning-driver/ConcurrentQueue.h
+++ b/mlir/tools/rocmlir-tuning-driver/ConcurrentQueue.h
@@ -1,4 +1,4 @@
-//===- ConcurrentQueue.h - Simple MPMC queue --------------------*- C++ -*-===//
+//===- ConcurrentQueue.h - Rate-adaptive MPMC queue -------------*- C++ -*-===//
 //
 // Part of the rocMLIR Project, under the Apache License v2.0 with LLVM
 // Exceptions. See https://llvm.org/LICENSE.txt for license information.
@@ -11,7 +11,9 @@
 
 #include "llvm/Support/Compiler.h"
 
+#include <algorithm>
 #include <atomic>
+#include <cassert>
 #include <condition_variable>
 #include <mutex>
 #include <queue>
@@ -21,26 +23,39 @@ namespace rocmlir::tuningdriver {
 template <typename T>
 class ConcurrentQueue {
 public:
+  // If maxCapacity is 0, the queue is unbounded
+  explicit ConcurrentQueue(size_t maxCapacity = 0) : maxCapacity(maxCapacity) {}
+
   template <typename U>
   bool push(U &&item) {
     if (LLVM_UNLIKELY(done.load(std::memory_order_relaxed)))
       return false; // Early exit if terminated
 
     {
-      std::lock_guard<std::mutex> lock(mtx);
+      std::unique_lock<std::mutex> lock(mtx);
+
+      if (maxCapacity > 0) {
+        cvNotFull.wait(lock, [this] {
+          return queue.size() < currentCapacity ||
+                 done.load(std::memory_order_relaxed);
+        });
+      }
+
       if (LLVM_UNLIKELY(done.load(std::memory_order_relaxed)))
-        return false; // Double-check after acquiring the lock
+        return false;
 
       queue.emplace(std::forward<U>(item));
     }
 
-    cv.notify_one();
+    cvNotEmpty.notify_one();
     return true;
   }
 
   bool pop(T &item) {
     std::unique_lock<std::mutex> lock(mtx);
-    cv.wait(lock, [this] {
+
+    bool starved = queue.empty();
+    cvNotEmpty.wait(lock, [this] {
       return !queue.empty() || done.load(std::memory_order_relaxed);
     });
 
@@ -49,20 +64,51 @@ class ConcurrentQueue {
 
     item = std::move(queue.front());
     queue.pop();
+
+    if (maxCapacity > 0) {
+      if (starved) {
+        // If the queue was empty, increase the capacity
+        currentCapacity = std::min(currentCapacity + 1, maxCapacity);
+        consecutiveFed = 0;
+      } else {
+        ++consecutiveFed;
+        if (consecutiveFed >= fedShrinkThreshold) {
+          // Decrease the capacity if the queue has been fed for a while
+          currentCapacity =
+              std::max(currentCapacity / 2, static_cast<size_t>(1));
+          consecutiveFed = 0;
+        }
+      }
+    }
+
+    lock.unlock();
+
+    cvNotFull.notify_one();
     return true;
   }
 
   void terminate() {
-    done.store(true, std::memory_order_relaxed);
-    cv.notify_all();
+    {
+      std::lock_guard<std::mutex> lock(mtx);
+      done.store(true, std::memory_order_relaxed);
+    }
+    cvNotEmpty.notify_all();
+    cvNotFull.notify_all();
   }
 
   bool isTerminated() const { return done.load(std::memory_order_relaxed); }
 
 private:
+  static constexpr size_t fedShrinkThreshold = 4;
+
+  const size_t maxCapacity;
+  size_t currentCapacity{maxCapacity};
+  size_t consecutiveFed{0};
+
   std::queue<T> queue;
   std::mutex mtx;
-  std::condition_variable cv;
+  std::condition_variable cvNotEmpty;
+  std::condition_variable cvNotFull;
   std::atomic<bool> done{false};
 };
 
diff --git a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp
index 23f8506f837b..1048ab72a8e3 100644
--- a/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp
+++ b/mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp
@@ -56,9 +56,6 @@
 #include <mutex>
 #include <thread>
 
-// Utilities to allocate buffers
-#include "../utils/performance/common/benchmarkUtils.h"
-
 #include "CacheFlush.h"
 #include "ConcurrentQueue.h"
 
@@ -183,30 +180,6 @@ static OwningOpRef<ModuleOp> parseMLIRInput(StringRef inputFilename,
   return parseSourceFile<ModuleOp>(sourceMgr, context);
 }
 
-static benchmark::DataType getDataType(Type inputType) {
-  if (inputType.isF32()) {
-    return benchmark::DataType::F32;
-  } else if (inputType.isInteger(32)) {
-    return benchmark::DataType::I32;
-  } else if (inputType.isF16()) {
-    return benchmark::DataType::F16;
-  } else if (inputType.isBF16()) {
-    return benchmark::DataType::BF16;
-  } else if (inputType.isInteger(8)) {
-    return benchmark::DataType::I8;
-  } else if (isa<Float8E4M3FNUZType, Float8E4M3FNType, Float8E5M2Type,
-                 Float8E5M2FNUZType>(inputType)) {
-    return benchmark::DataType::F8;
-  } else if (isa<Float8E8M0FNUType>(inputType)) {
-    return benchmark::DataType::F8E8M0FNU;
-  } else if (isa<Float4E2M1FNType>(inputType)) {
-    return benchmark::DataType::F4;
-  } else {
-    llvm::errs() << "Unknown data type: " << inputType << "\n";
-    llvm_unreachable("Kernels only accept ints or floats");
-  }
-}
-
 // intentionally leaky macro
 #define HIPCHECK(expr)                                                         \
   if (hipSuccess != (expr)) {                                                  \
@@ -431,28 +404,14 @@ measureLargeKernel(unsigned iterations, hipStream_t stream,
 }
 
 // In order to match rocprof, returns time in nanoseconds
-static FailureOr<double>
-benchmarkKernels(ArrayRef<std::string> binaries,
-                 ArrayRef<std::string> funcNames, ArrayRef<uint32_t> blockSizes,
-                 ArrayRef<uint32_t> gridSizes, ArrayRef<void *> hostBuffers,
-                 MutableArrayRef<void *> gpuBuffers,
-                 ArrayRef<size_t> bufferSizes, const BenchmarkParams &params) {
+static FailureOr<double> benchmarkKernels(ArrayRef<std::string> binaries,
+                                          ArrayRef<std::string> funcNames,
+                                          ArrayRef<uint32_t> blockSizes,
+                                          ArrayRef<uint32_t> gridSizes,
+                                          MutableArrayRef<void *> gpuBuffers,
+                                          hipStream_t stream,
+                                          const BenchmarkParams &params) {
   bool benchmarkMode = !params.benchmarkConfig.empty();
-  hipStream_t stream;
-  HIPCHECK(hipStreamCreate(&stream));
-  auto streamCleanup = llvm::make_scope_exit([&]() {
-    hipError_t destroyStatus = hipStreamDestroy(stream);
-    if (destroyStatus != hipSuccess) {
-      llvm::errs() << "HIP error in hipStreamDestroy: "
-                   << hipGetErrorString(destroyStatus) << "\n";
-    }
-  });
-
-  // Initialize device buffers
-  for (size_t i = 0; i < bufferSizes.size(); i++) {
-    HIPCHECK(hipMemcpyAsync(gpuBuffers[i], hostBuffers[i], bufferSizes[i],
-                            hipMemcpyHostToDevice, stream));
-  }
 
   // HIP wants an array of pointers to each argument
   std::vector<void *> argPointers;
@@ -702,12 +661,19 @@ static LogicalResult runTuningLoop(ModuleOp source) {
   backendOpts.optLevel = 3;
   backendOpts.suppressDiagnostic = true;
 
-  // 3. Initialize host buffers and allocate device buffers
-  std::vector<void *> hostBuffers;
+  // 3. Create HIP stream and allocate device buffers
+  hipStream_t stream;
+  HIPCHECK(hipStreamCreate(&stream));
+  auto streamCleanup = llvm::make_scope_exit([&]() {
+    hipError_t status = hipStreamDestroy(stream);
+    if (status != hipSuccess) {
+      llvm::errs() << "HIP error in hipStreamDestroy: "
+                   << hipGetErrorString(status) << "\n";
+    }
+  });
+
   std::vector<void *> gpuBuffers;
   auto bufferCleanup = llvm::make_scope_exit([&]() {
-    for (void *buffer : hostBuffers)
-      free(buffer);
     for (void *buffer : gpuBuffers) {
       // hipFree does not allow nullptrs, so make sure to check for it first
       if (!buffer)
@@ -722,21 +688,17 @@ static LogicalResult runTuningLoop(ModuleOp source) {
       llvm::errs() << "Failed to cleanup cache flush artifacts\n";
     }
   });
-  assert(argTypes.size() == bufferLengths.size() &&
-         "number of arguments and buffer lengths must match");
-  for (auto [argType, bufferLength] : llvm::zip(argTypes, bufferLengths)) {
-    benchmark::DataType type = getDataType(getElementTypeOrSelf(argType));
-    void *hostBuffer = benchmark::allocAndFill(type, bufferLength);
+  for (size_t bufferLength : bufferLengths) {
     void *gpuBuffer = nullptr;
     hipError_t hipStatus = hipMalloc(&gpuBuffer, bufferLength);
     if (hipStatus != hipSuccess) {
-      free(hostBuffer);
-      llvm::errs() << "HIP error in hipMalloc(gpuBuffer): "
-                   << hipGetErrorString(hipStatus) << "\n";
+      llvm::errs() << "HIP error in hipMalloc: " << hipGetErrorString(hipStatus)
+                   << "\n";
       return failure();
     }
-    hostBuffers.push_back(hostBuffer);
     gpuBuffers.push_back(gpuBuffer);
+    // 0x3F decodes as a finite, non-zero value for all floating point types
+    HIPCHECK(hipMemsetAsync(gpuBuffer, 0x3F, bufferLength, stream));
   }
 
   // 4. Multi-iteration tuning loop
@@ -750,13 +712,31 @@ static LogicalResult runTuningLoop(ModuleOp source) {
       sleepUs,           showStats,        showAllMeasurements, tuningSpaceKind,
       numCompileThreads, benchmarkConfig,  waitForCompiles};
 
-  unsigned numTuningIterations =
-      rock::getNumberOfIterations(benchmarkParams.tuningSpaceKind);
+  rock::TuningParamSetKind effectiveKind = benchmarkParams.tuningSpaceKind;
+  unsigned numTuningIterations = rock::getNumberOfIterations(effectiveKind);
   if (!benchmarkParams.benchmarkConfig.empty() && numTuningIterations != 1) {
     llvm::errs() << "benchmarking should do a single tuning iteration\n";
     return failure();
   }
 
+  // Serialize source module once (shared by all threads for parsing).
+  // The source module doesn't change between greedy iterations.
+  std::string sourceModuleStr;
+  {
+    llvm::raw_string_ostream sourceOs(sourceModuleStr);
+    source->print(sourceOs);
+  }
+
+  unsigned maxThreads = (benchmarkParams.numCompileThreads > 0)
+                            ? benchmarkParams.numCompileThreads
+                            : std::thread::hardware_concurrency();
+  if (maxThreads == 0)
+    maxThreads = 4;
+
+  // Thread resources are cached and reused across greedy iterations to avoid
+  // re-creating MLIRContexts, PassManagers, and re-parsing the source module.
+  std::vector<ThreadResources> threadResources;
+
   // Main iteration loop - wraps config generation, compilation, AND
   // benchmarking
   for (unsigned iterIdx = 0; iterIdx < numTuningIterations; ++iterIdx) {
@@ -779,6 +759,11 @@ static LogicalResult runTuningLoop(ModuleOp source) {
         return failure();
       }
 
+      // The tuning space may have fallen back to a different kind (e.g. Greedy
+      // -> Exhaustive for non-accel), so adjust the iteration count.
+      effectiveKind = tuningSpace->effectiveKind;
+      numTuningIterations = rock::getNumberOfIterations(effectiveKind);
+
       for (rock::RockTuningParamAttrInterface tuningAttr :
            tuningSpace->tuningRange) {
         SmallString<64> perfConfig;
@@ -787,55 +772,47 @@ static LogicalResult runTuningLoop(ModuleOp source) {
       }
     }
 
-    // Determine number of parallel threads
-    unsigned numThreads = (benchmarkParams.numCompileThreads > 0)
-                              ? benchmarkParams.numCompileThreads
-                              : std::thread::hardware_concurrency();
-    if (numThreads == 0)
-      numThreads = 4; // fallback
-
-    // Don't create more threads than configs to compile
-    numThreads = std::min(numThreads, static_cast<unsigned>(configs.size()));
-
-    // Serialize source module once (shared by all threads for parsing)
-    std::string sourceModuleStr;
-    {
-      llvm::raw_string_ostream sourceOs(sourceModuleStr);
-      source->print(sourceOs);
-    }
+    unsigned numThreads =
+        std::min(maxThreads, static_cast<unsigned>(configs.size()));
 
     // PHASE 2: Pre-initialize thread resources (contexts, PassManagers, parsed
     // modules). This avoids the expensive per-config overhead of creating
     // contexts, parsing modules, and building pipelines.
+    // On the first iteration all resources are created; subsequent iterations
+    // only grow the pool if more threads are needed.
     // Note: MLIR's MLIRContext cannot be safely shared across parallel pass
     // executions, so each thread needs its own context.
-    std::vector<ThreadResources> threadResources(numThreads);
-    std::atomic<bool> initFailed{false};
-
-    {
-      std::vector<std::thread> initThreads;
-      initThreads.reserve(numThreads);
-      for (unsigned i = 0; i < numThreads; ++i) {
-        initThreads.emplace_back([&, i]() {
-          if (!threadResources[i].initialize(sourceModuleStr, applicabilityOpts,
-                                             compilationKernOpts,
-                                             backendOpts)) {
-            initFailed.store(true, std::memory_order_relaxed);
-          }
-        });
-      }
-      for (auto &t : initThreads) {
-        t.join();
+    if (static_cast<size_t>(numThreads) > threadResources.size()) {
+      unsigned oldSize = threadResources.size();
+      threadResources.resize(numThreads);
+      std::atomic<bool> initFailed{false};
+      {
+        std::vector<std::thread> initThreads;
+        initThreads.reserve(numThreads - oldSize);
+        for (unsigned i = oldSize; i < numThreads; ++i) {
+          initThreads.emplace_back([&, i]() {
+            if (!threadResources[i].initialize(
+                    sourceModuleStr, applicabilityOpts, compilationKernOpts,
+                    backendOpts)) {
+              initFailed.store(true, std::memory_order_relaxed);
+            }
+          });
+        }
+        for (auto &t : initThreads) {
+          t.join();
+        }
       }
-    }
 
-    if (initFailed.load(std::memory_order_relaxed)) {
-      llvm::errs() << "Failed to initialize thread resources\n";
-      return failure();
+      if (initFailed.load(std::memory_order_relaxed)) {
+        llvm::errs() << "Failed to initialize thread resources\n";
+        return failure();
+      }
     }
 
     // PHASE 3: Parallel compilation phase using pre-initialized resources
-    ConcurrentQueue<CompilationResult> compilationResults;
+    // Unbounded queue if we wait for all compilations to avoid blocking
+    size_t maxCapacity = benchmarkParams.waitForCompiles ? 0 : numThreads;
+    ConcurrentQueue<CompilationResult> compilationResults(maxCapacity);
     std::mutex outputMutex; // For thread-safe console output
 
     // Compile a single config using pre-initialized thread resources
@@ -990,10 +967,9 @@ static LogicalResult runTuningLoop(ModuleOp source) {
       assert(result.status == CompilationStatus::Success &&
              "Unexpected compilation status in benchmarking phase");
 
-      FailureOr<double> timing =
-          benchmarkKernels(result.hipModules, kernelFuncNames,
-                           result.blockSizes, result.gridSizes, hostBuffers,
-                           gpuBuffers, bufferLengths, benchmarkParams);
+      FailureOr<double> timing = benchmarkKernels(
+          result.hipModules, kernelFuncNames, result.blockSizes,
+          result.gridSizes, gpuBuffers, stream, benchmarkParams);
 
       if (failed(timing)) {
         llvm::errs() << "Kernel execution failed\n";
@@ -1003,7 +979,7 @@ static LogicalResult runTuningLoop(ModuleOp source) {
 
       validResults++;
       // Find best config
-      if (rock::needToUpdateBest(benchmarkParams.tuningSpaceKind)) {
+      if (rock::needToUpdateBest(effectiveKind)) {
         if (timing.value() < bestTimeOverall) {
           bestTimeOverall = timing.value();
           bestConfigOverall = result.perfConfig;