diff --git a/Cargo.lock b/Cargo.lock index bc6799a91..2df3d4d32 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1533,6 +1533,16 @@ dependencies = [ "darling_macro 0.13.4", ] +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + [[package]] name = "darling" version = "0.23.0" @@ -1557,6 +1567,20 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.11.1", + "syn 2.0.114", +] + [[package]] name = "darling_core" version = "0.23.0" @@ -1581,6 +1605,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn 2.0.114", +] + [[package]] name = "darling_macro" version = "0.23.0" @@ -3874,6 +3909,29 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "nvml-wrapper" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c9bff0aa1d48904a1385ea2a8b97576fbdcbc9a3cfccd0d31fe978e1c4038c5" +dependencies = [ + "bitflags", + "libloading 0.8.9", + "nvml-wrapper-sys", + "static_assertions", + "thiserror 1.0.69", + "wrapcenum-derive", +] + +[[package]] +name = "nvml-wrapper-sys" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "698d45156f28781a4e79652b6ebe2eaa0589057d588d3aec1333f6466f13fcb5" +dependencies = [ + "libloading 0.8.9", +] + [[package]] name = "object" version = 
"0.32.2" @@ -5278,13 +5336,16 @@ dependencies = [ "arrow-schema", "bindgen", "cmake", + "geo", "log", + "nvml-wrapper", "sedona-expr", "sedona-geos", "sedona-schema", "sedona-testing", "thiserror 2.0.17", "which", + "wkt 0.14.0", ] [[package]] @@ -5759,6 +5820,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.10.0" @@ -6742,6 +6809,18 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "wrapcenum-derive" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a76ff259533532054cfbaefb115c613203c73707017459206380f03b3b3f266e" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "writeable" version = "0.6.2" diff --git a/c/sedona-libgpuspatial/Cargo.toml b/c/sedona-libgpuspatial/Cargo.toml index f271cd57a..efde2d986 100644 --- a/c/sedona-libgpuspatial/Cargo.toml +++ b/c/sedona-libgpuspatial/Cargo.toml @@ -40,8 +40,11 @@ which = "8.0" arrow-array = { workspace = true, features = ["ffi"] } arrow-schema = { workspace = true } thiserror = { workspace = true } +geo = { workspace = true } +wkt = { workspace = true } log = "0.4" sedona-schema = { path = "../../rust/sedona-schema" } +nvml-wrapper = "0.10.0" [dev-dependencies] sedona-expr = { path = "../../rust/sedona-expr" } diff --git a/c/sedona-libgpuspatial/build.rs b/c/sedona-libgpuspatial/build.rs index 6bf5f3f8b..db9f3a48f 100644 --- a/c/sedona-libgpuspatial/build.rs +++ b/c/sedona-libgpuspatial/build.rs @@ -119,6 +119,13 @@ fn main() { println!("cargo:warning=CMAKE_CUDA_ARCHITECTURES environment variable not set. 
Defaulting to '86;89'."); "86;89".to_string() }); + // Determine the build profile to match Cargo's debug/release mode + let profile_mode = if cfg!(debug_assertions) { + "Debug" + } else { + "Release" + }; + let dst = cmake::Config::new("./libgpuspatial") .define("CMAKE_CUDA_ARCHITECTURES", cuda_architectures) .define("CMAKE_POLICY_VERSION_MINIMUM", "3.5") // Allow older CMake versions @@ -157,6 +164,17 @@ fn main() { println!("cargo:rustc-link-lib=static=gpuspatial"); println!("cargo:rustc-link-lib=static=rmm"); println!("cargo:rustc-link-lib=static=rapids_logger"); + // Use the 'd' suffix for the debug build of spdlog (libspdlogd.a) + let spdlog_lib_name = if cfg!(debug_assertions) { + "spdlogd" + } else { + "spdlog" + }; + println!( + "cargo:warning=Linking spdlog in {} mode: lib{}.a", + profile_mode, spdlog_lib_name + ); + println!("cargo:rustc-link-lib=static={}", spdlog_lib_name); println!("cargo:rustc-link-lib=static=geoarrow"); println!("cargo:rustc-link-lib=static=nanoarrow"); println!("cargo:rustc-link-lib=stdc++"); diff --git a/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt b/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt index 773cf2061..eab272481 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt +++ b/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt @@ -132,8 +132,13 @@ config_shaders(PTX_FILES) message("-- Config shader PTX files ${PTX_FILES}") -add_library(gpuspatial src/rt/rt_engine.cpp src/relate_engine.cu src/spatial_joiner.cu - ${PTX_FILES}) +add_library(gpuspatial + src/rt/rt_engine.cpp + src/memory_manager.cc + src/relate_engine.cu + src/rt_spatial_index.cu + src/rt_spatial_refiner.cu + ${PTX_FILES}) # Link libraries target_link_libraries(gpuspatial @@ -142,8 +147,7 @@ target_link_libraries(gpuspatial cuda rmm::rmm rapids_logger::rapids_logger - OptiX - PRIVATE zstd) + OptiX) # Set include directories target_include_directories(gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json 
b/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json index 55248ea7f..0cb8a7fbb 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json +++ b/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json @@ -31,7 +31,7 @@ "name": "default", "configurePreset": "default-with-tests", "environment": { - "GPUSPATIAL_TEST_DIR": "${sourceDir}/test_data" + "GPUSPATIAL_TEST_DIR": "${sourceDir}/test/data" } } ] diff --git a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake index 1f4d53c22..a7314c151 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake +++ b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake @@ -47,6 +47,7 @@ function(find_and_configure_geoarrow) "BUILD_SHARED_LIBS OFF" ${_exclude_from_all}) set_target_properties(geoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_options(geoarrow PRIVATE -Wno-conversion) rapids_export_find_package_root(BUILD geoarrow "${geoarrow_BINARY_DIR}" diff --git a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake index ecc3b4179..61932beb6 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake +++ b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake @@ -48,6 +48,10 @@ function(find_and_configure_nanoarrow) "NANOARROW_NAMESPACE gpuspatial" ${_exclude_from_all}) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) + if(TARGET nanoarrow_ipc) # Tests need this + target_compile_options(nanoarrow_ipc PRIVATE -Wno-conversion) + endif() + target_compile_options(nanoarrow PRIVATE -Wno-conversion) rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.cuh 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.hpp similarity index 89% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.hpp index 9fb33fa8e..971f3565d 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.hpp @@ -16,9 +16,9 @@ // under the License. #pragma once -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/cuda_utils.h" -#include "gpuspatial/utils/helpers.h" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" +#include "gpuspatial/utils/helpers.cuh" #include @@ -86,22 +86,26 @@ class Box { } DEV_HOST_INLINE OptixAabb ToOptixAabb() const { - OptixAabb aabb; + OptixAabb aabb{0, 0, 0, 0, 0, 0}; - memset(&aabb, 0, sizeof(OptixAabb)); - if (sizeof(scalar_t) == sizeof(float)) { + if constexpr (sizeof(scalar_t) == sizeof(float)) { for (int dim = 0; dim < n_dim; dim++) { - reinterpret_cast(&aabb.minX)[dim] = min_.get_coordinate(dim); - reinterpret_cast(&aabb.maxX)[dim] = max_.get_coordinate(dim); + auto min_val = min_.get_coordinate(dim); + auto max_val = max_.get_coordinate(dim); + if (min_val == max_val) { + min_val = next_float_from_double(min_val, -1, 2); + max_val = next_float_from_double(max_val, 1, 2); + } + (&aabb.minX)[dim] = min_val; + (&aabb.maxX)[dim] = max_val; } } else { for (int dim = 0; dim < n_dim; dim++) { auto min_val = min_.get_coordinate(dim); auto max_val = max_.get_coordinate(dim); - reinterpret_cast(&aabb.minX)[dim] = - next_float_from_double(min_val, -1, 2); - reinterpret_cast(&aabb.maxX)[dim] = next_float_from_double(max_val, 1, 2); + (&aabb.minX)[dim] = next_float_from_double(min_val, -1, 2); + (&aabb.maxX)[dim] = next_float_from_double(max_val, 1, 2); } } return aabb; @@ -137,6 +141,8 @@ class Box { DEV_HOST_INLINE scalar_t get_min(int dim) const { return 
min_.get_coordinate(dim); } + DEV_HOST_INLINE bool valid() const { return !min_.empty() && !max_.empty(); } + DEV_HOST_INLINE const point_t& get_max() const { return max_; } DEV_HOST_INLINE scalar_t get_max(int dim) const { return max_.get_coordinate(dim); } diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.hpp similarity index 95% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.hpp index 433317190..66c7dee45 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_collection.hpp @@ -15,15 +15,15 @@ // specific language governing permissions and limitations // under the License. #pragma once -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/geom/geometry_type.cuh" -#include "gpuspatial/geom/line_string.cuh" -#include "gpuspatial/geom/multi_line_string.cuh" -#include "gpuspatial/geom/multi_point.cuh" -#include "gpuspatial/geom/multi_polygon.cuh" -#include "gpuspatial/geom/point.cuh" -#include "gpuspatial/geom/polygon.cuh" -#include "gpuspatial/utils/array_view.h" +#include "gpuspatial/geom/box.hpp" +#include "gpuspatial/geom/geometry_type.hpp" +#include "gpuspatial/geom/line_string.hpp" +#include "gpuspatial/geom/multi_line_string.hpp" +#include "gpuspatial/geom/multi_point.hpp" +#include "gpuspatial/geom/multi_polygon.hpp" +#include "gpuspatial/geom/point.hpp" +#include "gpuspatial/geom/polygon.hpp" +#include "gpuspatial/utils/array_view.hpp" namespace gpuspatial { diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_type.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_type.hpp similarity index 100% rename from 
c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_type.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/geometry_type.hpp diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.hpp similarity index 96% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.hpp index 75f83f38e..a4eef0707 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_segment.hpp @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. #pragma once -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/geom/point.cuh" -#include "gpuspatial/utils/cuda_utils.h" -#include "gpuspatial/utils/floating_point.h" +#include "gpuspatial/geom/box.hpp" +#include "gpuspatial/geom/point.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" +#include "gpuspatial/utils/floating_point.hpp" namespace gpuspatial { template diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.hpp similarity index 96% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.hpp index e0ddabe8e..00b57b0d9 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/line_string.hpp @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
#pragma once -#include "gpuspatial/geom/line_segment.cuh" -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/cuda_utils.h" +#include "gpuspatial/geom/line_segment.hpp" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" namespace gpuspatial { template diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.hpp similarity index 96% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.hpp index b6aae39f8..c5d84f1b6 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_line_string.hpp @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. #pragma once -#include "gpuspatial/geom/line_string.cuh" -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/cuda_utils.h" +#include "gpuspatial/geom/line_string.hpp" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" namespace gpuspatial { template diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.hpp similarity index 96% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.hpp index e01938e75..e6bc5a226 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_point.hpp @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
#pragma once -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/cuda_utils.h" +#include "gpuspatial/geom/box.hpp" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" namespace gpuspatial { diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.hpp similarity index 99% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.hpp index b1a443aec..9179789c6 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/multi_polygon.hpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. #pragma once -#include "gpuspatial/geom/polygon.cuh" +#include "gpuspatial/geom/polygon.hpp" namespace gpuspatial { template diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.hpp similarity index 94% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.hpp index 500d9def5..006da8d4b 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.hpp @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. 
#pragma once -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/cuda_utils.h" -#include "gpuspatial/utils/floating_point.h" -#include "gpuspatial/utils/type_traits.h" +#include "gpuspatial/geom/box.hpp" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" +#include "gpuspatial/utils/floating_point.hpp" +#include "gpuspatial/utils/type_traits.hpp" namespace gpuspatial { enum class PointLocation { @@ -73,7 +73,14 @@ class Point { DEV_HOST_INLINE const scalar_t* get_data() const { return &data_.x; } - DEV_HOST_INLINE bool empty() const { return std::isnan(data_.x); } + DEV_HOST_INLINE bool empty() const { + for (int dim = 0; dim < n_dim; dim++) { + if (std::isnan(get_coordinate(dim))) { + return true; + } + } + return false; + } DEV_HOST_INLINE void set_empty() { for (int dim = 0; dim < n_dim; dim++) { @@ -102,11 +109,7 @@ class Point { * @brief Provides const access to the x-coordinate. * This method is only available if N_DIM >= 1. */ - DEV_HOST_INLINE const scalar_t& x() const { - if constexpr (N_DIM >= 1) { - return data_.x; - } - } + DEV_HOST_INLINE const scalar_t& x() const { return data_.x; } /** * @brief Provides access to the y-coordinate. diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp similarity index 98% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp index 6ed66f168..e457a8fb2 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/polygon.hpp @@ -16,11 +16,11 @@ // under the License. 
#pragma once -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/geom/line_string.cuh" -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/cuda_utils.h" -#include "gpuspatial/utils/floating_point.h" +#include "gpuspatial/geom/box.hpp" +#include "gpuspatial/geom/line_string.hpp" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" +#include "gpuspatial/utils/floating_point.hpp" #include #include diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.hpp similarity index 98% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.hpp index 12963b845..b25a0ad9a 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/ray_crossing_counter.hpp @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. #pragma once -#include "gpuspatial/geom/point.cuh" -#include "gpuspatial/utils/cuda_utils.h" -#include "gpuspatial/utils/doubledouble.h" +#include "gpuspatial/geom/point.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" +#include "gpuspatial/utils/doubledouble.hpp" namespace gpuspatial { diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h index b31af58b0..994310fed 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h @@ -14,60 +14,162 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
+#include #include #ifdef __cplusplus extern "C" { #endif -struct GpuSpatialJoinerConfig { - uint32_t concurrency; +struct ArrowSchema; +struct ArrowArray; + +// Interfaces for ray-tracing engine (OptiX) +struct GpuSpatialRuntimeConfig { + /** Path to PTX files */ const char* ptx_root; + /** Device ID to use, 0 is the first GPU */ + int device_id; + /** Whether to use CUDA memory pool for allocations */ + bool use_cuda_memory_pool; + /** Ratio of initial memory pool size to total GPU memory, between 0 and 100 */ + int cuda_memory_pool_init_precent; }; -struct GpuSpatialJoinerContext { - const char* last_error; // Pointer to std::string to store last error message - void* private_data; // GPUSpatial context - void* build_indices; // Pointer to std::vector to store results - void* stream_indices; +/** Opaque runtime for GPU spatial operations + * Each process should only has one instance of GpuSpatialRuntimeexactly + * + */ +struct GpuSpatialRuntime { + /** Initialize the runtime (OptiX) with the given configuration + * @return 0 on success, non-zero on failure + */ + int (*init)(struct GpuSpatialRuntime* self, struct GpuSpatialRuntimeConfig* config); + void (*release)(struct GpuSpatialRuntime* self); + const char* (*get_last_error)(struct GpuSpatialRuntime* self); + void* private_data; }; -enum GpuSpatialPredicate { - GpuSpatialPredicateEquals = 0, - GpuSpatialPredicateDisjoint, - GpuSpatialPredicateTouches, - GpuSpatialPredicateContains, - GpuSpatialPredicateCovers, - GpuSpatialPredicateIntersects, - GpuSpatialPredicateWithin, - GpuSpatialPredicateCoveredBy +/** Create an instance of GpuSpatialRuntime */ +void GpuSpatialRuntimeCreate(struct GpuSpatialRuntime* runtime); + +struct GpuSpatialIndexConfig { + /** Pointer to an initialized GpuSpatialRuntime struct */ + struct GpuSpatialRuntime* runtime; + /** How many threads will concurrently call Probe method */ + uint32_t concurrency; +}; + +// An opaque context for concurrent probing +struct SedonaSpatialIndexContext 
{ + void* private_data; +}; + +struct SedonaFloatIndex2D { + /** Clear the spatial index, removing all built data */ + int (*clear)(struct SedonaFloatIndex2D* self); + /** Create a new context for concurrent probing */ + void (*create_context)(struct SedonaSpatialIndexContext* context); + /** Destroy a previously created context */ + void (*destroy_context)(struct SedonaSpatialIndexContext* context); + /** Push rectangles for building the spatial index, each rectangle is represented by 4 + * floats: [min_x, min_y, max_x, max_y] Points can also be indexed by providing [x, y, + * x, y] but points and rectangles cannot be mixed + * + * @return 0 on success, non-zero on failure + */ + int (*push_build)(struct SedonaFloatIndex2D* self, const float* buf, uint32_t n_rects); + /** + * Finish building the spatial index after all rectangles have been pushed + * + * @return 0 on success, non-zero on failure + */ + int (*finish_building)(struct SedonaFloatIndex2D* self); + /** + * Probe the spatial index with the given rectangles, each rectangle is represented by 4 + * floats: [min_x, min_y, max_x, max_y] Points can also be probed by providing [x, y, x, + * y] but points and rectangles cannot be mixed in one Probe call. The results of the + * probe will be stored in the context. 
+ * + * @return 0 on success, non-zero on failure + */ + int (*probe)(struct SedonaFloatIndex2D* self, struct SedonaSpatialIndexContext* context, + const float* buf, uint32_t n_rects); + /** Get the build indices buffer from the context + * + * @return A pointer to the buffer and its length + */ + void (*get_build_indices_buffer)(struct SedonaSpatialIndexContext* context, + uint32_t** build_indices, + uint32_t* build_indices_length); + /** Get the probe indices buffer from the context + * + * @return A pointer to the buffer and its length + */ + void (*get_probe_indices_buffer)(struct SedonaSpatialIndexContext* context, + uint32_t** probe_indices, + uint32_t* probe_indices_length); + const char* (*get_last_error)(struct SedonaFloatIndex2D* self); + const char* (*context_get_last_error)(struct SedonaSpatialIndexContext* context); + /** Release the spatial index and free all resources */ + void (*release)(struct SedonaFloatIndex2D* self); + void* private_data; }; -struct GpuSpatialJoiner { - int (*init)(struct GpuSpatialJoiner* self, struct GpuSpatialJoinerConfig* config); - void (*clear)(struct GpuSpatialJoiner* self); - void (*create_context)(struct GpuSpatialJoiner* self, - struct GpuSpatialJoinerContext* context); - void (*destroy_context)(struct GpuSpatialJoinerContext* context); - int (*push_build)(struct GpuSpatialJoiner* self, const struct ArrowSchema* schema, - const struct ArrowArray* array, int64_t offset, int64_t length); - int (*finish_building)(struct GpuSpatialJoiner* self); - int (*push_stream)(struct GpuSpatialJoiner* self, - struct GpuSpatialJoinerContext* context, - const struct ArrowSchema* schema, const struct ArrowArray* array, - int64_t offset, int64_t length, enum GpuSpatialPredicate predicate, - int32_t array_index_offset); - void (*get_build_indices_buffer)(struct GpuSpatialJoinerContext* context, - void** build_indices, uint32_t* build_indices_length); - void (*get_stream_indices_buffer)(struct GpuSpatialJoinerContext* context, - void** 
stream_indices, - uint32_t* stream_indices_length); - void (*release)(struct GpuSpatialJoiner* self); +int GpuSpatialIndexFloat2DCreate(struct SedonaFloatIndex2D* index, + const struct GpuSpatialIndexConfig* config); + +struct GpuSpatialRefinerConfig { + /** Pointer to an initialized GpuSpatialRuntime struct */ + struct GpuSpatialRuntime* runtime; + /** How many threads will concurrently call Probe method */ + uint32_t concurrency; + /** Whether to compress the BVH structures to save memory */ + bool compress_bvh; + /** Number of batches to pipeline for parsing and refinement; setting to 1 disables + * pipelining */ + uint32_t pipeline_batches; +}; + +enum SedonaSpatialRelationPredicate { + SedonaSpatialPredicateEquals = 0, + SedonaSpatialPredicateDisjoint, + SedonaSpatialPredicateTouches, + SedonaSpatialPredicateContains, + SedonaSpatialPredicateCovers, + SedonaSpatialPredicateIntersects, + SedonaSpatialPredicateWithin, + SedonaSpatialPredicateCoveredBy +}; + +struct SedonaSpatialRefiner { + int (*clear)(struct SedonaSpatialRefiner* self); + + int (*push_build)(struct SedonaSpatialRefiner* self, + const struct ArrowSchema* build_schema, + const struct ArrowArray* build_array); + + int (*finish_building)(struct SedonaSpatialRefiner* self); + + int (*refine_loaded)(struct SedonaSpatialRefiner* self, + const struct ArrowSchema* probe_schema, + const struct ArrowArray* probe_array, + enum SedonaSpatialRelationPredicate predicate, + uint32_t* build_indices, uint32_t* probe_indices, + uint32_t indices_size, uint32_t* new_indices_size); + + int (*refine)(struct SedonaSpatialRefiner* self, const struct ArrowSchema* schema1, + const struct ArrowArray* array1, const struct ArrowSchema* schema2, + const struct ArrowArray* array2, + enum SedonaSpatialRelationPredicate predicate, uint32_t* indices1, + uint32_t* indices2, uint32_t indices_size, uint32_t* new_indices_size); + const char* (*get_last_error)(struct SedonaSpatialRefiner* self); + void (*release)(struct 
SedonaSpatialRefiner* self); void* private_data; - const char* last_error; }; -void GpuSpatialJoinerCreate(struct GpuSpatialJoiner* index); +int GpuSpatialRefinerCreate(struct SedonaSpatialRefiner* refiner, + const struct GpuSpatialRefinerConfig* config); #ifdef __cplusplus } #endif diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/geometry_grouper.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/geometry_grouper.hpp deleted file mode 100644 index 5dab852d1..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/geometry_grouper.hpp +++ /dev/null @@ -1,294 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-#pragma once -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/loader/device_geometries.cuh" -#include "gpuspatial/utils/launcher.h" -#include "gpuspatial/utils/morton_code.h" - -#include "rmm/cuda_stream_view.hpp" -#include "rmm/device_uvector.hpp" -#include "rmm/exec_policy.hpp" - -#include -#include -#include - -#include - -namespace gpuspatial { -template -class GeometryGrouper { - using box_t = Box; - static constexpr int n_dim = POINT_T::n_dim; - using scalar_t = typename POINT_T::scalar_t; - - public: - void Group(const rmm::cuda_stream_view& stream, - const DeviceGeometries& geometries, - uint32_t geoms_per_aabb) { - switch (geometries.get_geometry_type()) { - case GeometryType::kPoint: { - Group( - stream, - geometries.template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kMultiPoint: { - Group(stream, - geometries - .template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kLineString: { - Group(stream, - geometries - .template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kMultiLineString: { - Group(stream, - geometries.template GetGeometryArrayView< - MultiLineStringArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kPolygon: { - Group(stream, - geometries - .template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kMultiPolygon: { - Group( - stream, - geometries - .template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kBox: { - Group(stream, - geometries.template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - default: - assert(false); - } - } - - template - void Group(const rmm::cuda_stream_view& stream, const GEOMETRY_ARRAY_T& geometries, - uint32_t geoms_per_aabb) { - rmm::device_uvector morton_codes(geometries.size(), stream); - POINT_T min_world_corner, max_world_corner; - - min_world_corner.set_max(); - max_world_corner.set_min(); - - for (int dim = 0; dim 
< n_dim; dim++) { - auto min_val = thrust::transform_reduce( - rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(geometries.size()), - [=] __host__ __device__(INDEX_T i) { - const auto& geom = geometries[i]; - const auto& mbr = geom.get_mbr(); - - return mbr.get_min(dim); - }, - std::numeric_limits::max(), thrust::minimum()); - - auto max_val = thrust::transform_reduce( - rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(geometries.size()), - [=] __host__ __device__(INDEX_T i) { - const auto& geom = geometries[i]; - const auto& mbr = geom.get_mbr(); - - return mbr.get_max(dim); - }, - std::numeric_limits::lowest(), thrust::maximum()); - min_world_corner.set_coordinate(dim, min_val); - max_world_corner.set_coordinate(dim, max_val); - } - - // compute morton codes and reorder indices - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(geometries.size()), - morton_codes.begin(), [=] __device__(INDEX_T i) { - const auto& geom = geometries[i]; - const auto& mbr = geom.get_mbr(); - auto p = mbr.centroid(); - POINT_T norm_p; - - for (int dim = 0; dim < n_dim; dim++) { - auto min_val = min_world_corner.get_coordinate(dim); - auto max_val = max_world_corner.get_coordinate(dim); - auto extent = min_val == max_val ? 
1 : max_val - min_val; - auto norm_val = (p.get_coordinate(dim) - min_val) / extent; - norm_p.set_coordinate(dim, norm_val); - } - return detail::morton_code(norm_p.get_vec()); - }); - reordered_indices_ = - std::make_unique>(geometries.size(), stream); - thrust::sequence(rmm::exec_policy_nosync(stream), reordered_indices_->begin(), - reordered_indices_->end()); - thrust::sort_by_key(rmm::exec_policy_nosync(stream), morton_codes.begin(), - morton_codes.end(), reordered_indices_->begin()); - - auto n_aabbs = (geometries.size() + geoms_per_aabb - 1) / geoms_per_aabb; - aabbs_ = std::make_unique>(n_aabbs, stream); - OptixAabb empty_aabb; - - if (n_dim == 2) { - empty_aabb = OptixAabb{ - std::numeric_limits::max(), std::numeric_limits::max(), 0, - std::numeric_limits::lowest(), std::numeric_limits::lowest(), 0}; - } else if (n_dim == 3) { - empty_aabb = OptixAabb{ - std::numeric_limits::max(), std::numeric_limits::max(), - std::numeric_limits::max(), std::numeric_limits::lowest(), - std::numeric_limits::lowest(), std::numeric_limits::lowest()}; - } - - thrust::fill(rmm::exec_policy_nosync(stream), aabbs_->begin(), aabbs_->end(), - empty_aabb); - - auto* p_aabbs = aabbs_->data(); - - rmm::device_uvector n_geoms_per_aabb(n_aabbs, stream); - - auto* p_reordered_indices = reordered_indices_->data(); - auto* p_n_geoms_per_aabb = n_geoms_per_aabb.data(); - - // each warp takes an AABB and processes points_per_aabb points - LaunchKernel(stream, [=] __device__() mutable { - typedef cub::WarpReduce WarpReduce; - __shared__ typename WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; - auto warp_id = threadIdx.x / 32; - auto lane_id = threadIdx.x % 32; - auto global_warp_id = TID_1D / 32; - auto n_warps = TOTAL_THREADS_1D / 32; - - for (uint32_t aabb_id = global_warp_id; aabb_id < n_aabbs; aabb_id += n_warps) { - POINT_T min_corner, max_corner; - size_t idx_begin = aabb_id * geoms_per_aabb; - size_t idx_end = std::min((size_t)geometries.size(), idx_begin + 
geoms_per_aabb); - size_t idx_end_rup = (idx_end + 31) / 32; - idx_end_rup *= 32; // round up to the next multiple of 32 - - p_n_geoms_per_aabb[aabb_id] = idx_end - idx_begin; - - for (auto idx = idx_begin + lane_id; idx < idx_end_rup; idx += 32) { - Box> mbr; - - auto warp_begin = idx - lane_id; - auto warp_end = std::min(warp_begin + 32, idx_end); - auto n_valid = warp_end - warp_begin; - - if (idx < idx_end) { - auto geom_idx = p_reordered_indices[idx]; - mbr = geometries[geom_idx].get_mbr(); - } - - for (int dim = 0; dim < n_dim; dim++) { - auto min_val = - WarpReduce(temp_storage[warp_id]) - .Reduce(mbr.get_min(dim), thrust::minimum(), n_valid); - if (lane_id == 0) { - min_corner.set_coordinate(dim, min_val); - } - auto max_val = - WarpReduce(temp_storage[warp_id]) - .Reduce(mbr.get_max(dim), thrust::maximum(), n_valid); - if (lane_id == 0) { - max_corner.set_coordinate(dim, max_val); - } - } - } - - if (lane_id == 0) { - box_t ext_mbr(min_corner, max_corner); - p_aabbs[aabb_id] = ext_mbr.ToOptixAabb(); - } - } - }); - - prefix_sum_ = std::make_unique>(n_aabbs + 1, stream); - prefix_sum_->set_element_to_zero_async(0, stream); - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), n_geoms_per_aabb.begin(), - n_geoms_per_aabb.end(), prefix_sum_->begin() + 1); -#ifndef NDEBUG - auto* p_prefix_sum = prefix_sum_->data(); - - thrust::for_each(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(aabbs_->size()), - [=] __device__(size_t aabb_idx) { - auto begin = p_prefix_sum[aabb_idx]; - auto end = p_prefix_sum[aabb_idx + 1]; - const auto& aabb = p_aabbs[aabb_idx]; - - for (auto i = begin; i < end; i++) { - auto geom_idx = p_reordered_indices[i]; - auto mbr = geometries[geom_idx].get_mbr(); - assert(mbr.covered_by(aabb)); - } - }); -#endif - } - - ArrayView get_aabbs() const { - if (aabbs_ != nullptr) { - return ArrayView(aabbs_->data(), aabbs_->size()); - } - return {}; - } - - ArrayView get_prefix_sum() const { - if 
(prefix_sum_ != nullptr) { - return ArrayView(prefix_sum_->data(), prefix_sum_->size()); - } - return {}; - } - - ArrayView get_reordered_indices() const { - if (reordered_indices_ != nullptr) { - return ArrayView(reordered_indices_->data(), reordered_indices_->size()); - } - return {}; - } - - void Clear() { - aabbs_ = nullptr; - prefix_sum_ = nullptr; - reordered_indices_ = nullptr; - } - - private: - std::unique_ptr> aabbs_; - std::unique_ptr> prefix_sum_; - std::unique_ptr> reordered_indices_; -}; -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/object_pool.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/object_pool.hpp deleted file mode 100644 index d0ab3e1ff..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/object_pool.hpp +++ /dev/null @@ -1,161 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#pragma once - -#include -#include -#include - -namespace gpuspatial { -// Forward declaration of ObjectPool to be used in the custom deleter. -template -class ObjectPool; - -// A helper struct to allow std::make_shared to access the private constructor. 
-// It inherits from ObjectPool and is defined outside of it. -template -struct PoolEnabler : public ObjectPool { - PoolEnabler(size_t size) : ObjectPool(size) {} -}; - -// A custom deleter for std::shared_ptr. -// When the shared_ptr's reference count goes to zero, this deleter -// will be invoked, returning the object to the pool instead of deleting it. -template -class PoolDeleter { - public: - // Constructor takes a weak_ptr to the pool to avoid circular references. - PoolDeleter(std::weak_ptr> pool) : pool_(pool) {} - - // The function call operator is what std::shared_ptr invokes. - void operator()(T* ptr) const { - // Attempt to lock the weak_ptr to get a shared_ptr to the pool. - if (auto pool_sp = pool_.lock()) { - // If the pool still exists, return the object to it. - pool_sp->release(ptr); - } else { - // If the pool no longer exists, we must delete the pointer to avoid a memory leak. - delete ptr; - } - } - - private: - std::weak_ptr> pool_; -}; - -/** - * @brief A thread-safe object pool for reusable objects. - * - * @tparam T The type of object to pool. - */ -template -class ObjectPool : public std::enable_shared_from_this> { - friend struct PoolEnabler; - - // Constructor is private to force object creation through the static 'create' method. - // This ensures the ObjectPool is always managed by a std::shared_ptr. - ObjectPool(size_t initial_size = 0) { - for (size_t i = 0; i < initial_size; ++i) { - pool_.push_back(new T()); - } - } - - public: - /** - * @brief Factory method to create an instance of the ObjectPool. - * Guarantees that the pool is managed by a std::shared_ptr, which is required - * for the custom deleter mechanism to work correctly. - * - * @param initial_size The number of objects to pre-allocate. - * @return A std::shared_ptr to the new ObjectPool instance. - */ - static std::shared_ptr> create(size_t initial_size = 0) { - return std::make_shared>(initial_size); - } - - /** - * @brief Destructor. 
Cleans up any remaining objects in the pool. - */ - ~ObjectPool() { - std::lock_guard lock(mutex_); - for (T* item : pool_) { - delete item; - } - pool_.clear(); - } - - // Disable copy constructor and assignment operator - ObjectPool(const ObjectPool&) = delete; - ObjectPool& operator=(const ObjectPool&) = delete; - - /** - * @brief Acquires an object from the pool. - * - * If the pool is empty, a new object is created. The returned shared_ptr - * has a custom deleter that will return the object to the pool when it's - * no longer referenced. - * - * @return A std::shared_ptr to an object of type T. - */ - std::shared_ptr take() { - std::lock_guard lock(mutex_); - T* resource_ptr = nullptr; - if (!pool_.empty()) { - // Take an existing object from the pool - resource_ptr = pool_.back(); - pool_.pop_back(); - } else { - // Pool is empty, create a new object - resource_ptr = new T(); - } - - // Create a custom deleter that knows how to return the object to this pool. - // this->shared_from_this() is now safe because creation is forced through the - // 'create' method. - PoolDeleter deleter(this->shared_from_this()); - - // Return a shared_ptr with the custom deleter. - return std::shared_ptr(resource_ptr, deleter); - } - - /** - * @brief Returns an object to the pool. - * - * This method is intended to be called by the PoolDeleter, not directly by clients. - * - * @param object The raw pointer to the object to return to the pool. - */ - void release(T* object) { - std::lock_guard lock(mutex_); - pool_.push_back(object); - } - - /** - * @brief Gets the current number of available objects in the pool. - * @return The size of the pool. 
- */ - size_t size() { - std::lock_guard lock(mutex_); - return pool_.size(); - } - - private: - std::vector pool_; - std::mutex mutex_; -}; - -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh new file mode 100644 index 000000000..baaeb77f6 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include "gpuspatial/index/rt_spatial_index.hpp" +#include "gpuspatial/index/spatial_index.hpp" +#include "gpuspatial/rt/rt_engine.hpp" +#include "gpuspatial/utils/gpu_timer.hpp" +#include "gpuspatial/utils/queue.hpp" + +#include "rmm/cuda_stream_pool.hpp" +#include "rmm/cuda_stream_view.hpp" +#include "rmm/device_uvector.hpp" +#define GPUSPATIAL_PROFILING +namespace gpuspatial { + +/** * @brief A spatial index implementation using NVIDIA OptiX ray tracing engine. + * + * This class provides spatial indexing capabilities for geometric data using + * the OptiX ray tracing engine. 
It supports building the index from either + * points or bounding boxes and allows for efficient spatial queries. + * + * @tparam SCALAR_T The scalar type used for coordinates (e.g., float, double). + * @tparam N_DIM The number of dimensions (e.g., 2 for 2D, 3 for 3D). + */ +template +class RTSpatialIndex : public SpatialIndex { + using point_t = typename SpatialIndex::point_t; + using box_t = typename SpatialIndex::box_t; + using scalar_t = typename point_t::scalar_t; + static constexpr int n_dim = point_t::n_dim; + + using index_t = uint32_t; // type of the index to represent geometries + struct SpatialIndexContext { + rmm::cuda_stream_view stream; + std::string shader_id; + rmm::device_buffer bvh_buffer{0, rmm::cuda_stream_default}; + OptixTraversableHandle handle; + std::vector h_launch_params_buffer; + rmm::device_buffer launch_params_buffer{0, rmm::cuda_stream_default}; + std::unique_ptr> counter; + // output + Queue build_indices; + rmm::device_uvector probe_indices{0, rmm::cuda_stream_default}; +#ifdef GPUSPATIAL_PROFILING + GPUTimer timer; + // counters + double alloc_ms = 0.0; + double bvh_build_ms = 0.0; + double rt_ms = 0.0; + double copy_res_ms = 0.0; +#endif + }; + + public: + RTSpatialIndex() = default; + + RTSpatialIndex(const RTSpatialIndexConfig& config); + + void Clear() override; + + void PushBuild(const box_t* rects, uint32_t n_rects) override; + + void FinishBuilding() override; + + void Probe(const box_t* rects, uint32_t n_rects, std::vector* build_indices, + std::vector* probe_indices) override; + + private: + RTSpatialIndexConfig config_; + std::unique_ptr stream_pool_; + bool indexing_points_; + // The rectangles being indexed or the MBRs of grouped points + rmm::device_uvector rects_{0, rmm::cuda_stream_default}; + // Data structures for indexing points + rmm::device_uvector point_ranges_{0, rmm::cuda_stream_default}; + rmm::device_uvector reordered_point_indices_{0, rmm::cuda_stream_default}; + rmm::device_uvector points_{0, 
rmm::cuda_stream_default}; + rmm::device_buffer bvh_buffer_{0, rmm::cuda_stream_default}; + OptixTraversableHandle handle_; + + void allocateResultBuffer(SpatialIndexContext& ctx, uint32_t capacity) const; + + void handleBuildPoint(SpatialIndexContext& ctx, ArrayView points, + bool counting) const; + + void handleBuildPoint(SpatialIndexContext& ctx, ArrayView rects, + bool counting) const; + + void handleBuildBox(SpatialIndexContext& ctx, ArrayView points, + bool counting) const; + + void handleBuildBox(SpatialIndexContext& ctx, ArrayView rects, + bool counting) const; + + void prepareLaunchParamsBoxQuery(SpatialIndexContext& ctx, ArrayView probe_rects, + bool forward, bool counting) const; + + void filter(SpatialIndexContext& ctx, uint32_t dim_x) const; + + size_t numGeometries() const { + return indexing_points_ ? points_.size() : rects_.size(); + } +}; +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.hpp similarity index 53% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.hpp rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.hpp index 6c836dfa9..b34475edd 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.hpp +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.hpp @@ -16,13 +16,31 @@ // under the License. 
#pragma once -#include "gpuspatial/index/streaming_joiner.hpp" +#include "gpuspatial/index/spatial_index.hpp" +#include "gpuspatial/rt/rt_engine.hpp" #include +#include namespace gpuspatial { -std::unique_ptr CreateSpatialJoiner(); -void InitSpatialJoiner(StreamingJoiner* index, const char* ptx_root, - uint32_t concurrency); +struct RTSpatialIndexConfig { + std::shared_ptr rt_engine; + // Prefer fast build the BVH + bool prefer_fast_build = false; + // Compress the BVH to save memory + bool compact = true; + // How many threads are allowed to call PushProbe concurrently + uint32_t concurrency = 1; + // number of points to represent an AABB when doing point-point queries + uint32_t n_points_per_aabb = 8; + RTSpatialIndexConfig() : prefer_fast_build(false), compact(false) { + concurrency = std::thread::hardware_concurrency(); + } +}; + +template +std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config); + } // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_index.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_index.hpp new file mode 100644 index 000000000..688d0a9b6 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_index.hpp @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once +#include "gpuspatial/geom/box.hpp" +#include "gpuspatial/geom/point.hpp" + +#include +#include +#include + +namespace gpuspatial { +template +class SpatialIndex { + public: + using point_t = Point; + using box_t = Box; + + virtual ~SpatialIndex() = default; + + /** + * Provide an array of geometries to build the index. + * @param rects An array of rectangles to be indexed. + */ + virtual void PushBuild(const box_t* rects, uint32_t n_rects) = 0; + + /** + * Waiting the index to be built. + * This method should be called after all geometries have been pushed. + */ + virtual void FinishBuilding() = 0; + + /** + * Remove all geometries from the index, so the index can reused. + */ + virtual void Clear() = 0; + + /** + * Query the index with an array of rectangles and return the indices of + * the rectangles. This method is thread-safe. + * @param build_indices A vector to store the indices of the geometries in the index + * that have a spatial overlap with the geometries in the stream. + * @param stream_indices A vector to store the indices of the geometries in the stream + * that have a spatial overlap with the geometries in the index. 
+ */ + virtual void Probe(const box_t* rects, uint32_t n_rects, + std::vector* build_indices, + std::vector* stream_indices) { + throw std::runtime_error("Not implemented"); + } +}; + +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.cuh deleted file mode 100644 index 1c93a54b2..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.cuh +++ /dev/null @@ -1,184 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-#pragma once -#include "geoarrow/geoarrow_type.h" -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/geom/point.cuh" -#include "gpuspatial/index/detail/rt_engine.hpp" -#include "gpuspatial/index/geometry_grouper.hpp" -#include "gpuspatial/index/object_pool.hpp" -#include "gpuspatial/index/relate_engine.cuh" -#include "gpuspatial/index/streaming_joiner.hpp" -#include "gpuspatial/loader/device_geometries.cuh" -#include "gpuspatial/loader/parallel_wkb_loader.h" -#include "gpuspatial/utils/gpu_timer.hpp" -#include "gpuspatial/utils/queue.h" -#include "gpuspatial/utils/thread_pool.h" - -#include "rmm/cuda_stream_pool.hpp" -#include "rmm/cuda_stream_view.hpp" -#include "rmm/device_uvector.hpp" - -#include -#include - - -// #define GPUSPATIAL_PROFILING -namespace gpuspatial { - -class SpatialJoiner : public StreamingJoiner { - // TODO: Assuming every thing is 2D in double for now - using scalar_t = double; - static constexpr int n_dim = 2; - using index_t = uint32_t; // type of the index to represent geometries - // geometry types - using point_t = Point; - using multi_point_t = MultiPoint; - using line_string_t = LineString; - using multi_line_string_t = MultiLineString; - using polygon_t = Polygon; - using multi_polygon_t = MultiPolygon; - // geometry array types - using point_array_t = PointArrayView; - using multi_point_array_t = MultiPointArrayView; - using line_string_array_t = LineStringArrayView; - using multi_line_string_array_t = MultiLineStringArrayView; - using polygon_array_t = PolygonArrayView; - using multi_polygon_array_t = MultiPolygonArrayView; - - using dev_geometries_t = DeviceGeometries; - using box_t = Box>; - using loader_t = ParallelWkbLoader; - - public: - struct SpatialJoinerConfig : Config { - const char* ptx_root; - // Prefer fast build the BVH - bool prefer_fast_build = false; - // Compress the BVH to save memory - bool compact = true; - // Loader configurations - // How many threads to use for parsing WKBs - uint32_t parsing_threads = 
std::thread::hardware_concurrency(); - // How many threads are allowed to call PushStream concurrently - uint32_t concurrency = 1; - // number of points to represent an AABB when doing point-point queries - uint32_t n_points_per_aabb = 8; - // reserve a ratio of available memory for result sets - float result_buffer_memory_reserve_ratio = 0.2; - // the memory quota for relate engine compared to the available memory - float relate_engine_memory_quota = 0.8; - // this value determines RELATE_MAX_DEPTH - size_t stack_size_bytes = 3 * 1024; - SpatialJoinerConfig() : ptx_root(nullptr), prefer_fast_build(false), compact(false) { - concurrency = std::thread::hardware_concurrency(); - } - }; - - struct SpatialJoinerContext : Context { - rmm::cuda_stream_view cuda_stream; - std::string shader_id; - std::unique_ptr stream_loader; - dev_geometries_t stream_geometries; - std::unique_ptr bvh_buffer; - OptixTraversableHandle handle; - std::vector h_launch_params_buffer; - std::unique_ptr launch_params_buffer; - // output - Queue> results; - int32_t array_index_offset; -#ifdef GPUSPATIAL_PROFILING - GPUTimer timer; - // counters - double parse_ms = 0.0; - double alloc_ms = 0.0; - double filter_ms = 0.0; - double refine_ms = 0.0; - double copy_res_ms = 0.0; -#endif - }; - - SpatialJoiner() = default; - - ~SpatialJoiner() = default; - - void Init(const Config* config) override; - - void Clear() override; - - void PushBuild(const ArrowSchema* schema, const ArrowArray* array, int64_t offset, - int64_t length) override; - - void FinishBuilding() override; - - std::shared_ptr CreateContext() override { return ctx_pool_->take(); } - - void PushStream(Context* ctx, const ArrowSchema* schema, const ArrowArray* array, - int64_t offset, int64_t length, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices, - int32_t array_index_offset) override; - - // Internal method but has to be public for the CUDA kernel to access - void 
handleBuildPointStreamPoint(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices); - - void handleBuildBoxStreamPoint(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices); - - void handleBuildPointStreamBox(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices); - - void handleBuildBoxStreamBox(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices); - - void filter(SpatialJoinerContext* ctx, uint32_t dim_x, bool swap_id = false); - - void refine(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices); - - private: - SpatialJoinerConfig config_; - std::unique_ptr stream_pool_; - std::shared_ptr thread_pool_; - details::RTEngine rt_engine_; - std::unique_ptr bvh_buffer_; - std::unique_ptr build_loader_; - - DeviceGeometries build_geometries_; - // For grouping points with space-filing curve - GeometryGrouper geometry_grouper_; - RelateEngine relate_engine_; - OptixTraversableHandle handle_; - - std::shared_ptr> ctx_pool_; - - OptixTraversableHandle buildBVH(const rmm::cuda_stream_view& stream, - const ArrayView& aabbs, - std::unique_ptr& buffer); - - void allocateResultBuffer(SpatialJoinerContext* ctx); - - void prepareLaunchParamsBoxQuery(SpatialJoinerContext* ctx, bool forward); -}; - -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/streaming_joiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/streaming_joiner.hpp deleted file mode 100644 index ccf8a3bfe..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/streaming_joiner.hpp +++ /dev/null @@ -1,98 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#pragma once -#include "gpuspatial/relate/predicate.cuh" - -#include "nanoarrow/nanoarrow.hpp" - -#include -#include -#include -namespace gpuspatial { - -class StreamingJoiner { - public: - struct Context { - virtual ~Context() = default; - }; - - struct Config { - virtual ~Config() = default; - }; - - virtual ~StreamingJoiner() = default; - - /** - * Initialize the index with the given configuration. This method should be called only - * once before using the index. - * @param config - */ - virtual void Init(const Config* config) = 0; - - /** - * Provide an array of geometries to build the index. - * @param array ArrowArray that contains the geometries in WKB format. - * @param offset starting index of the ArrowArray - * @param length length of the ArrowArray to read. - */ - virtual void PushBuild(const ArrowSchema* schema, const ArrowArray* array, - int64_t offset, int64_t length) = 0; - - /** - * Waiting the index to be built. - * This method should be called after all geometries have been pushed. - */ - virtual void FinishBuilding() = 0; - - /** - * Remove all geometries from the index, so the index can reused. 
- */ - virtual void Clear() = 0; - - /** - * Query the index with an array of geometries in WKB format and return the indices of - * the geometries in stream and the index that satisfy a given predicate. This method is - * thread-safe. - * @param context A context object that can be used to store intermediate results. - * @param array ArrowArray that contains the geometries in WKB format. - * @param offset starting index of the ArrowArray - * @param length length of the ArrowArray to read. - * @param predicate A predicate to filter the query results. - * @param build_indices A vector to store the indices of the geometries in the index - * that have a spatial overlap with the geometries in the stream. - * @param stream_indices A vector to store the indices of the geometries in the stream - * that have a spatial overlap with the geometries in the index. - * @param stream_index_offset An offset to be added to stream_indices - */ - virtual void PushStream(Context* context, const ArrowSchema* schema, - const ArrowArray* array, int64_t offset, int64_t length, - Predicate predicate, std::vector* build_indices, - std::vector* stream_indices, - int32_t stream_index_offset) { - throw std::runtime_error("Not implemented"); - } - - /** - * Create a context object for issuing queries against the index. - * @return A context object that is used to store intermediate results. 
- */ - virtual std::shared_ptr CreateContext() { - throw std::runtime_error("Not implemented"); - } -}; - -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.hpp similarity index 96% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.hpp index 3c44ca324..2d59d0a89 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/device_geometries.hpp @@ -15,13 +15,13 @@ // specific language governing permissions and limitations // under the License. #pragma once -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/geom/geometry_type.cuh" -#include "gpuspatial/geom/multi_line_string.cuh" -#include "gpuspatial/geom/multi_point.cuh" -#include "gpuspatial/geom/multi_polygon.cuh" -#include "gpuspatial/geom/polygon.cuh" -#include "gpuspatial/utils/array_view.h" +#include "gpuspatial/geom/box.hpp" +#include "gpuspatial/geom/geometry_type.hpp" +#include "gpuspatial/geom/multi_line_string.hpp" +#include "gpuspatial/geom/multi_point.hpp" +#include "gpuspatial/geom/multi_polygon.hpp" +#include "gpuspatial/geom/polygon.hpp" +#include "gpuspatial/utils/array_view.hpp" #include "rmm/device_uvector.hpp" diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.hpp similarity index 72% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.hpp index cb2186ff3..ff9962bcf 100644 --- 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.hpp @@ -15,78 +15,41 @@ // specific language governing permissions and limitations // under the License. #pragma once - -#include "gpuspatial/geom/geometry_type.cuh" -#include "gpuspatial/loader/device_geometries.cuh" +#include "gpuspatial/geom/geometry_type.hpp" +#include "gpuspatial/loader/device_geometries.hpp" +#include "gpuspatial/mem/memory_manager.hpp" #include "gpuspatial/utils/logger.hpp" +#include "gpuspatial/utils/markers.hpp" #include "gpuspatial/utils/mem_utils.hpp" -#include "gpuspatial/utils/stopwatch.h" -#include "gpuspatial/utils/thread_pool.h" +#include "gpuspatial/utils/stopwatch.hpp" +#include "gpuspatial/utils/thread_pool.hpp" + +#include "nanoarrow/nanoarrow.hpp" -#include "nanoarrow/nanoarrow.h" +#include "geoarrow/geoarrow.hpp" #include "rmm/cuda_stream_view.hpp" #include "rmm/device_uvector.hpp" #include "rmm/exec_policy.hpp" +#include #include +#include + +#include +#include +#include #include #include - -#include -#include +#include namespace gpuspatial { namespace detail { -inline long long get_free_physical_memory_linux() { - struct sysinfo info; - if (sysinfo(&info) == 0) { - // info.freeram is in bytes (or unit defined by info.mem_unit) - // Use info.freeram * info.mem_unit for total free bytes - return (long long)info.freeram * (long long)info.mem_unit; - } - return 0; // Error -} - -// Copied from GeoArrow, it is faster than using GeoArrowWKBReaderRead -struct WKBReaderPrivate { - const uint8_t* data; - int64_t size_bytes; - const uint8_t* data0; - int need_swapping; - GeoArrowGeometry geom; -}; - -static int WKBReaderReadEndian(struct WKBReaderPrivate* s, struct GeoArrowError* error) { - if (s->size_bytes > 0) { - s->need_swapping = s->data[0] != GEOARROW_NATIVE_ENDIAN; - s->data++; - s->size_bytes--; - return GEOARROW_OK; - } else { - GeoArrowErrorSet(error, 
"Expected endian byte but found end of buffer at byte %ld", - (long)(s->data - s->data0)); - return EINVAL; - } -} - -static int WKBReaderReadUInt32(struct WKBReaderPrivate* s, uint32_t* out, - struct GeoArrowError* error) { - if (s->size_bytes >= 4) { - memcpy(out, s->data, sizeof(uint32_t)); - s->data += sizeof(uint32_t); - s->size_bytes -= sizeof(uint32_t); - if (s->need_swapping) { - *out = __builtin_bswap32(*out); - } - return GEOARROW_OK; - } else { - GeoArrowErrorSet(error, "Expected uint32 but found end of buffer at byte %ld", - (long)(s->data - s->data0)); - return EINVAL; - } +inline bool is_little_endian() { + const uint16_t x = 0x0001; + return *reinterpret_cast(&x) != 0; } /** @@ -105,6 +68,7 @@ template struct HostParsedGeometries { constexpr static int n_dim = POINT_T::n_dim; using mbr_t = Box>; + GeometryType type; // A general type that can reprs // each feature should have only one type except GeometryCollection std::vector feature_types; // This number should be one except GeometryCollection, which should be unnested # of @@ -120,17 +84,18 @@ struct HostParsedGeometries { bool has_geometry_collection = false; bool create_mbr = false; - HostParsedGeometries(bool multi_, bool has_geometry_collection_, bool create_mbr_) { + HostParsedGeometries(GeometryType t) : type(t) { + multi = type == GeometryType::kMultiPoint || type == GeometryType::kMultiLineString || + type == GeometryType::kMultiPolygon; + has_geometry_collection = type == GeometryType::kGeometryCollection; + create_mbr = type != GeometryType::kPoint; // Multi and GeometryCollection are mutually exclusive - assert(!(multi_ && has_geometry_collection_)); - multi = multi_; - has_geometry_collection = has_geometry_collection_; - create_mbr = create_mbr_; + assert(!(multi && has_geometry_collection)); } void AddGeometry(const GeoArrowGeometryView* geom) { if (geom == nullptr) { - throw std::runtime_error("Null geometry not supported yet"); + addNullEntry(); return; } @@ -405,6 +370,49 @@ 
struct HostParsedGeometries { } return node + 1; } + + void addNullEntry() { + // 1. Maintain MBR alignment if this type has MBRs + if (create_mbr) { + mbr_t empty_mbr; + empty_mbr.set_empty(); + mbrs.push_back(empty_mbr); + } + + // 2. Push zero-placeholders to maintain offset alignment + if (has_geometry_collection) { + // Null collection => 0 sub-geometries + num_geoms.push_back(0); + } else { + switch (type) { + case GeometryType::kPoint: { + // Push NaN point to represent empty/null + POINT_T p; + p.set_empty(); + vertices.push_back(p); + break; + } + case GeometryType::kLineString: + num_points.push_back(0); + break; + case GeometryType::kPolygon: + num_rings.push_back(0); + break; + case GeometryType::kMultiPoint: + num_points.push_back(0); + break; + case GeometryType::kMultiLineString: + num_parts.push_back(0); + break; + case GeometryType::kMultiPolygon: + num_parts.push_back(0); + break; + default: + throw std::runtime_error( + "Null geometry encountered for unsupported geometry type"); + } + } + } }; template @@ -442,7 +450,8 @@ struct DeviceParsedGeometries { } void Append(rmm::cuda_stream_view stream, - const std::vector>& host_geoms) { + const std::vector>& host_geoms, + double& t_alloc_ms, double& t_copy_ms) { size_t sz_feature_types = 0; size_t sz_num_geoms = 0; size_t sz_num_parts = 0; @@ -482,6 +491,9 @@ struct DeviceParsedGeometries { prev_sz_mbrs * sizeof(mbr_t) / 1024 / 1024, sz_mbrs * sizeof(mbr_t) / 1024 / 1024); + Stopwatch sw; + + sw.start(); feature_types.resize(feature_types.size() + sz_feature_types, stream); num_geoms.resize(num_geoms.size() + sz_num_geoms, stream); num_parts.resize(num_parts.size() + sz_num_parts, stream); @@ -489,7 +501,11 @@ struct DeviceParsedGeometries { num_points.resize(num_points.size() + sz_num_points, stream); vertices.resize(vertices.size() + sz_vertices, stream); mbrs.resize(mbrs.size() + sz_mbrs, stream); - + stream.synchronize(); + sw.stop(); + t_alloc_ms += sw.ms(); + Instrument::Range r("H2D", 
gpuspatial::Color::Blue); + sw.start(); for (auto& geoms : host_geoms) { detail::async_copy_h2d(stream, geoms.feature_types.data(), feature_types.data() + prev_sz_feature_types, @@ -518,6 +534,9 @@ struct DeviceParsedGeometries { prev_sz_vertices += geoms.vertices.size(); prev_sz_mbrs += geoms.mbrs.size(); } + stream.synchronize(); + sw.stop(); + t_copy_ms += sw.ms(); } }; } // namespace detail @@ -531,9 +550,7 @@ class ParallelWkbLoader { public: struct Config { - // How many rows of WKBs to process in one chunk - // This value affects the peak memory usage and overheads - int chunk_size = 16 * 1024; + float memory_quota = 0.8f; // percentage of free memory to use }; ParallelWkbLoader() @@ -543,9 +560,8 @@ class ParallelWkbLoader { : thread_pool_(thread_pool) {} void Init(const Config& config = Config()) { - ArrowArrayViewInitFromType(&array_view_, NANOARROW_TYPE_BINARY); config_ = config; - geometry_type_ = GeometryType::kNull; + Clear(rmm::cuda_stream_default); } void Clear(rmm::cuda_stream_view stream) { @@ -553,72 +569,97 @@ class ParallelWkbLoader { geoms_.Clear(stream); } - void Parse(rmm::cuda_stream_view stream, const ArrowArray* array, int64_t offset, - int64_t length) { - using host_geometries_t = detail::HostParsedGeometries; + void Parse(rmm::cuda_stream_view stream, const ArrowSchema* schema, + const ArrowArray* array, int64_t offset, int64_t length) { + auto begin = thrust::make_counting_iterator(offset); + auto end = begin + length; + + Parse(stream, schema, array, begin, end); + } + + template + void Parse(rmm::cuda_stream_view stream, const ArrowSchema* schema, + const ArrowArray* array, OFFSET_IT begin, OFFSET_IT end) { ArrowError arrow_error; - if (ArrowArrayViewSetArray(&array_view_, array, &arrow_error) != NANOARROW_OK) { + + if (ArrowArrayViewInitFromSchema(array_view_.get(), schema, &arrow_error) != + NANOARROW_OK) { + throw std::runtime_error("ArrowArrayViewInitFromSchema error " + + std::string(arrow_error.message)); + } + using 
host_geometries_t = detail::HostParsedGeometries; + + size_t num_offsets = std::distance(begin, end); + if (num_offsets == 0) return; + + if (ArrowArrayViewSetArray(array_view_.get(), array, &arrow_error) != NANOARROW_OK) { throw std::runtime_error("ArrowArrayViewSetArray error " + std::string(arrow_error.message)); } + auto parallelism = thread_pool_->num_threads(); - auto est_bytes = estimateTotalBytes(array, offset, length); - auto free_memory = detail::get_free_physical_memory_linux(); + uint64_t est_bytes = estimateTotalBytes(begin, end); + + uint64_t free_memory = MemoryManager::get_available_host_memory(); + uint64_t memory_quota = free_memory * config_.memory_quota; uint32_t est_n_chunks = est_bytes / free_memory + 1; - uint32_t chunk_size = (length + est_n_chunks - 1) / est_n_chunks; + + // Use num_offsets instead of offsets.size() + uint32_t chunk_size = (num_offsets + est_n_chunks - 1) / est_n_chunks; + uint32_t n_chunks = (num_offsets + chunk_size - 1) / chunk_size; GPUSPATIAL_LOG_INFO( - "Parsing %ld rows, est arrow size %ld MB, free memory %lld, chunk size %u\n", - length, est_bytes / 1024 / 1024, free_memory / 1024 / 1024, chunk_size); + "Parsing %zu rows, est ArrowArray size %lu MB, Free Host Memory %lu MB, Memory quota %lu MB, Chunk Size %u, Total Chunks %u", + num_offsets, est_bytes / 1024 / 1024, free_memory / 1024 / 1024, + memory_quota / 1024 / 1024, chunk_size, n_chunks); - auto n_chunks = (length + chunk_size - 1) / chunk_size; Stopwatch sw; double t_fetch_type = 0, t_parse = 0, t_copy = 0; + double t_alloc = 0, t_h2d = 0; sw.start(); - updateGeometryType(offset, length); + // Assumption: updateGeometryType is updated to accept iterators (begin, end) + updateGeometryType(begin, end); sw.stop(); t_fetch_type = sw.ms(); - bool multi = geometry_type_ == GeometryType::kMultiPoint || - geometry_type_ == GeometryType::kMultiLineString || - geometry_type_ == GeometryType::kMultiPolygon; - bool has_geometry_collection = geometry_type_ == 
GeometryType::kGeometryCollection; - bool create_mbr = geometry_type_ != GeometryType::kPoint; - // reserve space geoms_.vertices.reserve(est_bytes / sizeof(POINT_T), stream); - if (create_mbr) geoms_.mbrs.reserve(array->length, stream); + if (geometry_type_ != GeometryType::kPoint) + geoms_.mbrs.reserve(array->length, stream); // Batch processing to reduce the peak memory usage - for (int64_t chunk = 0; chunk < n_chunks; chunk++) { + for (size_t chunk = 0; chunk < n_chunks; chunk++) { auto chunk_start = chunk * chunk_size; - auto chunk_end = std::min(length, (chunk + 1) * chunk_size); - auto work_size = chunk_end - chunk_start; + auto chunk_end = std::min(num_offsets, (chunk + 1) * chunk_size); + auto split_points = + assignBalancedWorks(begin + chunk_start, begin + chunk_end, parallelism); std::vector> pending_local_geoms; - auto thread_work_size = (work_size + parallelism - 1) / parallelism; - sw.start(); // Each thread will parse in parallel and store results sequentially for (int thread_idx = 0; thread_idx < parallelism; thread_idx++) { auto run = [&](int tid) { - // FIXME: SetDevice - auto thread_work_start = chunk_start + tid * thread_work_size; - auto thread_work_end = - std::min(chunk_end, thread_work_start + thread_work_size); - host_geometries_t local_geoms(multi, has_geometry_collection, create_mbr); + auto thread_work_start = split_points[tid]; + auto thread_work_end = split_points[tid + 1]; + host_geometries_t local_geoms(geometry_type_); GeoArrowWKBReader reader; GeoArrowError error; - GEOARROW_THROW_NOT_OK(nullptr, GeoArrowWKBReaderInit(&reader)); + GEOARROW_THROW_NOT_OK(&error, GeoArrowWKBReaderInit(&reader)); + + uint64_t chunk_bytes = + estimateTotalBytes(begin + thread_work_start, begin + thread_work_end); + local_geoms.vertices.reserve(chunk_bytes / sizeof(POINT_T)); for (uint32_t work_offset = thread_work_start; work_offset < thread_work_end; work_offset++) { - auto arrow_offset = work_offset + offset; + // Use iterator indexing (Requires 
RandomAccessIterator) + auto arrow_offset = begin[chunk_start + work_offset]; + // handle null value - if (ArrowArrayViewIsNull(&array_view_, arrow_offset)) { + if (ArrowArrayViewIsNull(array_view_.get(), arrow_offset)) { local_geoms.AddGeometry(nullptr); } else { - auto item = ArrowArrayViewGetBytesUnsafe(&array_view_, arrow_offset); + auto item = ArrowArrayViewGetBytesUnsafe(array_view_.get(), arrow_offset); GeoArrowGeometryView geom; GEOARROW_THROW_NOT_OK( @@ -629,6 +670,7 @@ class ParallelWkbLoader { } } + GeoArrowWKBReaderReset(&reader); return std::move(local_geoms); }; pending_local_geoms.push_back(std::move(thread_pool_->enqueue(run, thread_idx))); @@ -641,15 +683,14 @@ class ParallelWkbLoader { sw.stop(); t_parse += sw.ms(); sw.start(); - geoms_.Append(stream, local_geoms); + geoms_.Append(stream, local_geoms, t_alloc, t_h2d); stream.synchronize(); sw.stop(); t_copy += sw.ms(); } GPUSPATIAL_LOG_INFO( - "ParallelWkbLoader::Parse: fetched type in %.3f ms, parsed in %.3f ms, copied in " - "%.3f ms", - t_fetch_type, t_parse, t_copy); + "ParallelWkbLoader::Parse: fetched type in %.3f ms, parsed in %.3f ms, alloc %.3f ms, h2d copy %.3f ms", + t_fetch_type, t_parse, t_alloc, t_h2d); } DeviceGeometries Finish(rmm::cuda_stream_view stream) { @@ -746,8 +787,10 @@ class ParallelWkbLoader { std::move(ps_num_points); break; } + default: + throw std::runtime_error("Unsupported geometry type " + + GeometryTypeToString(geometry_type_) + " in Finish"); } - Clear(stream); stream.synchronize(); sw.stop(); GPUSPATIAL_LOG_INFO("Finish building DeviceGeometries in %.3f ms", sw.ms()); @@ -756,102 +799,99 @@ class ParallelWkbLoader { private: Config config_; - ArrowArrayView array_view_; + nanoarrow::UniqueArrayView array_view_; GeometryType geometry_type_; detail::DeviceParsedGeometries geoms_; std::shared_ptr thread_pool_; - void updateGeometryType(int64_t offset, int64_t length) { + template + void updateGeometryType(OFFSET_IT begin, OFFSET_IT end) { if (geometry_type_ == 
GeometryType::kGeometryCollection) { - // it's already the most generic type return; } - std::vector type_flags(8 /*WKB types*/, false); - std::vector workers; + size_t num_offsets = std::distance(begin, end); + if (num_offsets == 0) return; + auto parallelism = thread_pool_->num_threads(); - auto thread_work_size = (length + parallelism - 1) / parallelism; - std::vector> futures; + auto thread_work_size = (num_offsets + parallelism - 1) / parallelism; + + std::vector> futures; + futures.reserve(parallelism); + + // Detect Endianness once (outside the loop) + const bool host_is_little = detail::is_little_endian(); for (int thread_idx = 0; thread_idx < parallelism; thread_idx++) { - auto run = [&](int tid) { - auto thread_work_start = tid * thread_work_size; - auto thread_work_end = std::min(length, thread_work_start + thread_work_size); - GeoArrowWKBReader reader; - GeoArrowError error; - GEOARROW_THROW_NOT_OK(nullptr, GeoArrowWKBReaderInit(&reader)); + auto run = [=](int tid) -> uint32_t { + size_t thread_work_start = tid * thread_work_size; + size_t thread_work_end = + std::min(num_offsets, thread_work_start + thread_work_size); + + uint32_t local_seen_mask = 0; for (uint32_t work_offset = thread_work_start; work_offset < thread_work_end; work_offset++) { - auto arrow_offset = work_offset + offset; - // handle null value - if (ArrowArrayViewIsNull(&array_view_, arrow_offset)) { + auto arrow_offset = begin[work_offset]; + + if (ArrowArrayViewIsNull(array_view_.get(), arrow_offset)) { continue; } - auto item = ArrowArrayViewGetBytesUnsafe(&array_view_, arrow_offset); - auto* s = (struct detail::WKBReaderPrivate*)reader.private_data; - s->data = item.data.as_uint8; - s->data0 = s->data; - s->size_bytes = item.size_bytes; + auto item = ArrowArrayViewGetBytesUnsafe(array_view_.get(), arrow_offset); + + // Safety check: WKB minimal size is 5 bytes (1 byte order + 4 type) + if (item.size_bytes < 5) continue; - NANOARROW_THROW_NOT_OK(detail::WKBReaderReadEndian(s, 
&error)); + const uint8_t* data = item.data.as_uint8; + + // 1. Read Endianness Byte (0 = Big/XDR, 1 = Little/NDR) + uint8_t wkb_endian = data[0]; + + // 2. Read Type (Bytes 1-4) uint32_t geometry_type; - NANOARROW_THROW_NOT_OK(detail::WKBReaderReadUInt32(s, &geometry_type, &error)); + std::memcpy(&geometry_type, data + 1, sizeof(uint32_t)); + + // 3. Swap if mismatch + // If (WKB is Little) != (Host is Little), we must swap + if ((wkb_endian == 1) != host_is_little) { + geometry_type = __builtin_bswap32(geometry_type); + } + + // 4. Validate and Accumulate (Branchless Masking) if (geometry_type > 7) { - throw std::runtime_error( - "Extended WKB types are not currently supported, type = " + - std::to_string(geometry_type)); + // It's safer to throw exception outside the tight loop or set an error flag + // For now, we skip or you can throw. + throw std::runtime_error("Extended WKB types not supported: " + + std::to_string(geometry_type)); } - assert(geometry_type < type_flags.size()); - type_flags[geometry_type] = true; + + local_seen_mask |= (1 << geometry_type); } + return local_seen_mask; }; + futures.push_back(std::move(thread_pool_->enqueue(run, thread_idx))); } + + // Reduction + uint32_t global_mask = 0; for (auto& fu : futures) { - fu.get(); + global_mask |= fu.get(); } std::unordered_set types; - // include existing geometry type if (geometry_type_ != GeometryType::kNull) { types.insert(geometry_type_); } for (int i = 1; i <= 7; i++) { - if (type_flags[i]) { + if (global_mask & (1 << i)) { types.insert(static_cast(i)); } } - GeometryType final_type; - // Infer a generic type that can represent the current and previous types - switch (types.size()) { - case 0: - final_type = GeometryType::kNull; - break; - case 1: - final_type = *types.begin(); - break; - case 2: { - if (types.count(GeometryType::kPoint) && types.count(GeometryType::kMultiPoint)) { - final_type = GeometryType::kMultiPoint; - } else if (types.count(GeometryType::kLineString) && - 
types.count(GeometryType::kMultiLineString)) { - final_type = GeometryType::kMultiLineString; - } else if (types.count(GeometryType::kPolygon) && - types.count(GeometryType::kMultiPolygon)) { - final_type = GeometryType::kMultiPolygon; - } else { - final_type = GeometryType::kGeometryCollection; - } - break; - } - default: - final_type = GeometryType::kGeometryCollection; - } - geometry_type_ = final_type; + geometry_type_ = getUpcastedGeometryType(types); } template @@ -875,21 +915,107 @@ class ParallelWkbLoader { nums.shrink_to_fit(stream); } - size_t estimateTotalBytes(const ArrowArray* array, int64_t offset, int64_t length) { - ArrowError arrow_error; - if (ArrowArrayViewSetArray(&array_view_, array, &arrow_error) != NANOARROW_OK) { - throw std::runtime_error("ArrowArrayViewSetArray error " + - std::string(arrow_error.message)); - } + template + size_t estimateTotalBytes(OFFSET_IT begin, OFFSET_IT end) const { size_t total_bytes = 0; - for (int64_t i = 0; i < length; i++) { - if (!ArrowArrayViewIsNull(&array_view_, offset + i)) { - auto item = ArrowArrayViewGetBytesUnsafe(&array_view_, offset + i); + for (auto it = begin; it != end; ++it) { + auto offset = *it; + if (!ArrowArrayViewIsNull(array_view_.get(), offset)) { + auto item = ArrowArrayViewGetBytesUnsafe(array_view_.get(), offset); total_bytes += item.size_bytes - 1 // byte order - 2 * sizeof(uint32_t); // type + size } } return total_bytes; } + + template + std::vector assignBalancedWorks(OFFSET_IT begin, OFFSET_IT end, + uint32_t num_threads) const { + size_t total_bytes = 0; + std::vector bytes_per_row; + size_t num_rows = std::distance(begin, end); + + bytes_per_row.resize(num_rows, 0); + + // 1. 
Calculate bytes per row + for (auto it = begin; it != end; ++it) { + auto offset = *it; + if (!ArrowArrayViewIsNull(array_view_.get(), offset)) { + auto item = ArrowArrayViewGetBytesUnsafe(array_view_.get(), offset); + // Assuming item.size_bytes fits in uint32_t based on vector definition + bytes_per_row[it - begin] = static_cast(item.size_bytes); + } + } + + // 2. Calculate prefix sum + // We use size_t (or uint64_t) for the sum to prevent overflow + std::vector prefix_sum; + prefix_sum.reserve(num_rows + 1); + prefix_sum.push_back(0); + + for (uint32_t b : bytes_per_row) { + total_bytes += b; + prefix_sum.push_back(total_bytes); + } + + // 3. Calculate balanced split points + std::vector split_points; + split_points.reserve(num_threads + 1); + split_points.push_back(0); // The start index for the first thread + + // Avoid division by zero + if (num_threads > 0) { + double ideal_chunk_size = static_cast(total_bytes) / num_threads; + + for (uint32_t i = 1; i < num_threads; ++i) { + auto target_size = static_cast(i * ideal_chunk_size); + + // Find the first index where cumulative bytes >= target_size + auto it = std::lower_bound(prefix_sum.begin(), prefix_sum.end(), target_size); + + // Convert iterator to index (row number) + auto split_index = static_cast(std::distance(prefix_sum.begin(), it)); + split_points.push_back(split_index); + } + } + + // Ensure the last point is the total number of rows + // If num_threads was 0, this will be the second element (0, num_rows) + split_points.push_back(static_cast(num_rows)); + + return split_points; + } + + GeometryType getUpcastedGeometryType( + const std::unordered_set& types) const { + GeometryType final_type; + // Infer a generic type that can represent the current and previous types + switch (types.size()) { + case 0: + final_type = GeometryType::kNull; + break; + case 1: + final_type = *types.begin(); + break; + case 2: { + if (types.count(GeometryType::kPoint) && types.count(GeometryType::kMultiPoint)) { + 
final_type = GeometryType::kMultiPoint; + } else if (types.count(GeometryType::kLineString) && + types.count(GeometryType::kMultiLineString)) { + final_type = GeometryType::kMultiLineString; + } else if (types.count(GeometryType::kPolygon) && + types.count(GeometryType::kMultiPolygon)) { + final_type = GeometryType::kMultiPolygon; + } else { + final_type = GeometryType::kGeometryCollection; + } + break; + } + default: + final_type = GeometryType::kGeometryCollection; + } + return final_type; + } }; } // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/mem/memory_manager.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/mem/memory_manager.hpp new file mode 100644 index 000000000..7160fb6da --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/mem/memory_manager.hpp @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once +#include "rmm/mr/device/cuda_async_memory_resource.hpp" +#include "rmm/mr/device/device_memory_resource.hpp" +#include "rmm/mr/device/pool_memory_resource.hpp" +#include "rmm/mr/device/tracking_resource_adaptor.hpp" + +#include +namespace gpuspatial { +/** @brief An optional singleton memory manager to use asynchronous memory allocation and + * memory pool with RAPIDS's RMM memory resources. + * Once the memory manager is initialized, all GPU memory allocations will use the RMM's + * memory allocator. The user should call Shutdown() to cleanly release RMM resources + * before program exit. + */ +class MemoryManager { + public: + static MemoryManager& instance(); + + MemoryManager(const MemoryManager&) = delete; + MemoryManager& operator=(const MemoryManager&) = delete; + + /** + * @brief Initializes the memory resources. + * @param use_pool Whether to use RMM pool allocator + * @param init_pool_precent Initial pool size as percent of total GPU memory + */ + void Init(bool use_pool, int init_pool_precent = 50); + + /** + * @brief Estimates free memory available in bytes + * * If using a pool: Returns (Total GPU Mem - Tracked Bytes) * 0.95 safety factor. + * If direct: Returns actual CUDA free memory. + */ + size_t get_available_device_memory() const; + + /** + * @brief Estimates free host memory available in bytes + */ + static size_t get_available_host_memory(); + /** + * @brief Cleanly resets RMM resources. Automatically called on destruction. 
+ */ + void Shutdown(); + + private: + MemoryManager() = default; + ~MemoryManager(); + + // --- Type Aliases --- + using CudaMR = rmm::mr::cuda_async_memory_resource; + using PoolMR = rmm::mr::pool_memory_resource; + + // We have two possible tracker types depending on configuration + using PoolTracker = rmm::mr::tracking_resource_adaptor; + using CudaTracker = rmm::mr::tracking_resource_adaptor; + + // --- State --- + bool is_initialized_ = false; + bool use_pool_ = false; + + std::unique_ptr cuda_mr_; + std::unique_ptr pool_mr_; + std::unique_ptr active_resource_; + + void* raw_tracker_ptr_ = nullptr; +}; +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh new file mode 100644 index 000000000..4a9c3112c --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#pragma once +#include "gpuspatial/geom/box.hpp" +#include "gpuspatial/geom/point.hpp" +#include "gpuspatial/loader/device_geometries.hpp" +#include "gpuspatial/loader/parallel_wkb_loader.hpp" +#include "gpuspatial/refine/rt_spatial_refiner.hpp" +#include "gpuspatial/refine/spatial_refiner.hpp" +#include "gpuspatial/relate/relate_engine.cuh" +#include "gpuspatial/rt/rt_engine.hpp" +#include "gpuspatial/utils/gpu_timer.hpp" +#include "gpuspatial/utils/thread_pool.hpp" + +#include "geoarrow/geoarrow_type.h" +#include "nanoarrow/nanoarrow.h" + +#include "rmm/cuda_stream_pool.hpp" +#include "rmm/cuda_stream_view.hpp" + +#include + +#define GPUSPATIAL_PROFILING +namespace gpuspatial { + +class RTSpatialRefiner : public SpatialRefiner { + // TODO: Assuming every thing is 2D in double for now + using scalar_t = double; + static constexpr int n_dim = 2; + using index_t = uint32_t; // type of the index to represent geometries + // geometry types + using point_t = Point; + using multi_point_t = MultiPoint; + using line_string_t = LineString; + using multi_line_string_t = MultiLineString; + using polygon_t = Polygon; + using multi_polygon_t = MultiPolygon; + // geometry array types + using point_array_t = PointArrayView; + using multi_point_array_t = MultiPointArrayView; + using line_string_array_t = LineStringArrayView; + using multi_line_string_array_t = MultiLineStringArrayView; + using polygon_array_t = PolygonArrayView; + using multi_polygon_array_t = MultiPolygonArrayView; + + using dev_geometries_t = DeviceGeometries; + using box_t = Box>; + using loader_t = ParallelWkbLoader; + + static_assert(sizeof(Box>) == sizeof(box_t), + "Box> size mismatch!"); + + public: + struct IndicesMap { + // Sorted unique original indices + std::vector h_uniq_indices; + rmm::device_uvector d_uniq_indices{0, rmm::cuda_stream_default}; + // Mapping from original indices to consecutive zero-based indices + rmm::device_uvector d_reordered_indices{0, rmm::cuda_stream_default}; + }; + struct 
SpatialRefinerContext { + rmm::cuda_stream_view cuda_stream; +#ifdef GPUSPATIAL_PROFILING + GPUTimer timer; + // counters + double parse_ms = 0.0; + double alloc_ms = 0.0; + double refine_ms = 0.0; + double copy_res_ms = 0.0; +#endif + }; + + RTSpatialRefiner() = default; + + RTSpatialRefiner(const RTSpatialRefinerConfig& config); + + ~RTSpatialRefiner() = default; + + void Clear() override; + + void PushBuild(const ArrowSchema* build_schema, const ArrowArray* build_array) override; + + void FinishBuilding() override; + + uint32_t Refine(const ArrowSchema* probe_schema, const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, uint32_t* probe_indices, + uint32_t len) override; + + uint32_t Refine(const ArrowSchema* build_schema, const ArrowArray* build_array, + const ArrowSchema* probe_schema, const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, uint32_t* probe_indices, + uint32_t len) override; + + uint32_t RefinePipelined(const ArrowSchema* probe_schema, const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, + uint32_t* probe_indices, uint32_t len); + + private: + RTSpatialRefinerConfig config_; + std::unique_ptr stream_pool_; + std::shared_ptr thread_pool_; + std::unique_ptr> wkb_loader_; + dev_geometries_t build_geometries_; + + template + void buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT index_begin, + INDEX_IT index_end, IndicesMap& indices_map) const; +}; + +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.hpp new file mode 100644 index 000000000..6b6978799 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.hpp @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include "gpuspatial/refine/spatial_refiner.hpp" +#include "gpuspatial/rt/rt_engine.hpp" + +#include + +namespace gpuspatial { + +struct RTSpatialRefinerConfig { + std::shared_ptr rt_engine; + // Prefer fast build the BVH + bool prefer_fast_build = false; + // Compress the BVH to save memory + bool compact = true; + // Loader configurations + // How many threads to use for parsing WKBs + uint32_t parsing_threads = std::thread::hardware_concurrency(); + // How many threads are allowed to call PushStream concurrently + uint32_t concurrency = 1; + // Overlapping parsing and refinement by pipelining multiple batches; 1 means no + // pipelining + uint32_t pipeline_batches = 1; + // the host memory quota for WKB parser compared to the available memory + float wkb_parser_memory_quota = 0.8; + // the device memory quota for relate engine compared to the available memory + float relate_engine_memory_quota = 0.8; + // this value determines RELATE_MAX_DEPTH + size_t stack_size_bytes = 3 * 1024; + bool sort_probe_indices = true; // Sedona's spatial-join may require ordered output +}; + +std::unique_ptr CreateRTSpatialRefiner( + const RTSpatialRefinerConfig& config); + +} // namespace gpuspatial diff --git 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/spatial_refiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/spatial_refiner.hpp new file mode 100644 index 000000000..3b979ed56 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/spatial_refiner.hpp @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#pragma once +#include "gpuspatial/relate/predicate.hpp" + +#include "nanoarrow/nanoarrow.h" + +namespace gpuspatial { +class SpatialRefiner { + public: + virtual ~SpatialRefiner() = default; + + virtual void Clear() = 0; + + virtual void PushBuild(const ArrowSchema* build_schema, + const ArrowArray* build_array) = 0; + + virtual void FinishBuilding() = 0; + + virtual uint32_t Refine(const ArrowSchema* probe_schema, const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, + uint32_t* probe_indices, uint32_t len) = 0; + + virtual uint32_t Refine(const ArrowSchema* build_schema, const ArrowArray* build_array, + const ArrowSchema* probe_schema, const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, + uint32_t* probe_indices, uint32_t len) = 0; +}; + +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/intersection_matrix.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/intersection_matrix.hpp similarity index 100% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/intersection_matrix.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/intersection_matrix.hpp diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/predicate.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/predicate.hpp similarity index 100% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/predicate.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/predicate.hpp diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.hpp similarity index 95% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.hpp index 4b397453c..038ce7681 100644 --- 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate.hpp @@ -22,13 +22,13 @@ */ #pragma once -#include "gpuspatial/geom/line_string.cuh" -#include "gpuspatial/geom/multi_line_string.cuh" -#include "gpuspatial/geom/multi_point.cuh" -#include "gpuspatial/geom/multi_polygon.cuh" -#include "gpuspatial/geom/point.cuh" -#include "gpuspatial/geom/polygon.cuh" -#include "gpuspatial/relate/intersection_matrix.cuh" +#include "gpuspatial/geom/line_string.hpp" +#include "gpuspatial/geom/multi_line_string.hpp" +#include "gpuspatial/geom/multi_point.hpp" +#include "gpuspatial/geom/multi_polygon.hpp" +#include "gpuspatial/geom/point.hpp" +#include "gpuspatial/geom/polygon.hpp" +#include "gpuspatial/relate/intersection_matrix.hpp" // Ref: https://github.com/heterodb/pg-strom/blob/master/src/xpu_postgis.cu // A good visualize to cases // https://dev.luciad.com/portal/productDocumentation/LuciadFusion/docs/articles/guide/geometry/images/interior_exterior_boundary.png @@ -169,8 +169,10 @@ DEV_HOST int32_t relate(const POINT_T& P1, bool p1_is_head, const POINT_T& P2, if (p1_in_qq != PointLocation::kOutside && p2_in_qq != PointLocation::kOutside) { /* P1-P2 is fully contained by Q1-Q2 */ - if (p1_is_head) retval |= (IntersectionMatrix::BOUND_BOUND_0D | IM__LINE_HEAD_CONTAINED); - if (p2_is_tail) retval |= (IntersectionMatrix::BOUND_BOUND_0D | IM__LINE_TAIL_CONTAINED); + if (p1_is_head) + retval |= (IntersectionMatrix::BOUND_BOUND_0D | IM__LINE_HEAD_CONTAINED); + if (p2_is_tail) + retval |= (IntersectionMatrix::BOUND_BOUND_0D | IM__LINE_TAIL_CONTAINED); if (P1 == P2) { if (!p1_is_head && !p2_is_tail) retval |= IntersectionMatrix::INTER_BOUND_0D; @@ -457,8 +459,9 @@ DEV_HOST_INLINE int32_t relate(const LinearRing& ring, std::min(P1.x(), P2.x()) > mbr.get_max().x() || std::max(P1.y(), P2.y()) < mbr.get_min().y() || std::min(P1.y(), P2.y()) > mbr.get_max().y()) { - status = 
(IntersectionMatrix::INTER_EXTER_1D | IntersectionMatrix::BOUND_EXTER_0D | IntersectionMatrix::EXTER_INTER_2D | - IntersectionMatrix::EXTER_BOUND_1D | IntersectionMatrix::EXTER_EXTER_2D); + status = (IntersectionMatrix::INTER_EXTER_1D | IntersectionMatrix::BOUND_EXTER_0D | + IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D | + IntersectionMatrix::EXTER_EXTER_2D); } else { status = relate(P1, false, P2, false, geom, 0, false); // char res[10]; @@ -497,25 +500,32 @@ DEV_HOST_INLINE int32_t relate(const LinearRing& ring, */ if ((rflags & IntersectionMatrix::INTER_BOUND_2D) == IntersectionMatrix::INTER_BOUND_1D) boundary = IntersectionMatrix::BOUND_BOUND_1D; - else if ((rflags & IntersectionMatrix::INTER_BOUND_2D) == IntersectionMatrix::INTER_BOUND_0D) + else if ((rflags & IntersectionMatrix::INTER_BOUND_2D) == + IntersectionMatrix::INTER_BOUND_0D) boundary = IntersectionMatrix::BOUND_BOUND_0D; - if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 && (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0 && + if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 && + (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0 && (rflags & IntersectionMatrix::INTER_EXTER_2D) == 0) { /* ring equals to the polygon */ - return (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::BOUND_BOUND_1D | IntersectionMatrix::EXTER_EXTER_2D); - } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 && (rflags & IntersectionMatrix::INTER_BOUND_2D) == 0 && + return (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::BOUND_BOUND_1D | + IntersectionMatrix::EXTER_EXTER_2D); + } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 && + (rflags & IntersectionMatrix::INTER_BOUND_2D) == 0 && (rflags & IntersectionMatrix::INTER_EXTER_2D) != 0) { if (poly_has_outside) { /* disjoint */ - return (IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D | IntersectionMatrix::EXTER_INTER_2D | - IntersectionMatrix::EXTER_BOUND_1D | 
IntersectionMatrix::EXTER_EXTER_2D); + return (IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D | + IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D | + IntersectionMatrix::EXTER_EXTER_2D); } else { /* ring fully contains the polygons */ - return (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D | IntersectionMatrix::INTER_EXTER_2D | - IntersectionMatrix::BOUND_EXTER_1D | IntersectionMatrix::EXTER_EXTER_2D); + return (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D | + IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D | + IntersectionMatrix::EXTER_EXTER_2D); } - } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) != 0 && (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0 + } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) != 0 && + (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0 // TODO: Need this? && (rflags & IntersectionMatrix::INTER_EXTER_2D) != 0 ) { /* ring has intersection to the polygon */ @@ -523,26 +533,36 @@ DEV_HOST_INLINE int32_t relate(const LinearRing& ring, if ((rflags & IntersectionMatrix::INTER_EXTER_2D) != 0) { boundary |= IntersectionMatrix::BOUND_EXTER_1D; } - return boundary | (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D | IntersectionMatrix::INTER_EXTER_2D | - IntersectionMatrix::BOUND_INTER_1D | IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D | - IntersectionMatrix::EXTER_EXTER_2D); - } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 && (rflags & IntersectionMatrix::INTER_BOUND_2D) != 0 && + return boundary | + (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D | + IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_INTER_1D | + IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D | + IntersectionMatrix::EXTER_EXTER_2D); + } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) == 0 && + (rflags & 
IntersectionMatrix::INTER_BOUND_2D) != 0 && (rflags & IntersectionMatrix::INTER_EXTER_2D) != 0) { if (poly_has_outside) { /* ring touched the polygon at a boundary, but no intersection */ assert(boundary != 0); - return boundary | (IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D | IntersectionMatrix::EXTER_INTER_2D | - IntersectionMatrix::EXTER_BOUND_1D | IntersectionMatrix::EXTER_EXTER_2D); + return boundary | + (IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D | + IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D | + IntersectionMatrix::EXTER_EXTER_2D); } else { /* ring fully contains the polygon touched at boundaries */ assert(boundary != 0); - return boundary | (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D | IntersectionMatrix::INTER_EXTER_2D | - IntersectionMatrix::BOUND_EXTER_1D | IntersectionMatrix::EXTER_EXTER_2D); + return boundary | + (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::INTER_BOUND_1D | + IntersectionMatrix::INTER_EXTER_2D | IntersectionMatrix::BOUND_EXTER_1D | + IntersectionMatrix::EXTER_EXTER_2D); } - } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) != 0 && (rflags & IntersectionMatrix::INTER_EXTER_2D) == 0) { + } else if ((rflags & IntersectionMatrix::INTER_INTER_2D) != 0 && + (rflags & IntersectionMatrix::INTER_EXTER_2D) == 0) { /* ring is fully contained by the polygon; might be touched */ - return boundary | (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::BOUND_INTER_1D | IntersectionMatrix::EXTER_INTER_2D | - IntersectionMatrix::EXTER_BOUND_1D | IntersectionMatrix::EXTER_EXTER_2D); + return boundary | + (IntersectionMatrix::INTER_INTER_2D | IntersectionMatrix::BOUND_INTER_1D | + IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D | + IntersectionMatrix::EXTER_EXTER_2D); } // FIXME: printf("unknown intersection\n"); @@ -663,7 +683,8 @@ DEV_HOST_INLINE int32_t relate(const POINT_T& geom1, const 
MultiPolygon& geom2, ArrayView locations) { assert(geom2.num_polygons() == locations.size()); - if (geom2.empty()) return IntersectionMatrix::INTER_EXTER_0D | IntersectionMatrix::EXTER_EXTER_2D; + if (geom2.empty()) + return IntersectionMatrix::INTER_EXTER_0D | IntersectionMatrix::EXTER_EXTER_2D; int32_t retval = IntersectionMatrix::EXTER_EXTER_2D; bool matched = false; diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/relate_engine.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate_engine.cuh similarity index 66% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/relate_engine.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate_engine.cuh index 5fb275078..c83538a75 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/relate_engine.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate_engine.cuh @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. 
#pragma once -#include "gpuspatial/index/detail/rt_engine.hpp" -#include "gpuspatial/loader/device_geometries.cuh" -#include "gpuspatial/relate/predicate.cuh" -#include "gpuspatial/utils/queue.h" +#include "gpuspatial/loader/device_geometries.hpp" +#include "gpuspatial/relate/predicate.hpp" +#include "gpuspatial/rt/rt_engine.hpp" #include "rmm/cuda_stream_view.hpp" @@ -31,8 +30,9 @@ class RelateEngine { public: struct Config { bool bvh_fast_build = false; - bool bvh_fast_compact = true; + bool bvh_compact = true; float memory_quota = 0.8; + int segs_per_aabb = 32; }; RelateEngine() = default; @@ -40,80 +40,94 @@ class RelateEngine { RelateEngine(const DeviceGeometries* geoms1); RelateEngine(const DeviceGeometries* geoms1, - const details::RTEngine* rt_engine); + const RTEngine* rt_engine); void set_config(const Config& config) { config_ = config; } void Evaluate(const rmm::cuda_stream_view& stream, const DeviceGeometries& geoms2, Predicate predicate, - Queue>& ids); + rmm::device_uvector& ids1, rmm::device_uvector& ids2); template void Evaluate(const rmm::cuda_stream_view& stream, const GEOM2_ARRAY_VIEW_T& geom_array2, Predicate predicate, - Queue>& ids); + rmm::device_uvector& ids1, rmm::device_uvector& ids2); // This is a generic version that can accept any two geometry array views template void Evaluate(const rmm::cuda_stream_view& stream, const GEOM1_ARRAY_VIEW_T& geom_array1, const GEOM2_ARRAY_VIEW_T& geom_array2, Predicate predicate, - Queue>& ids); + rmm::device_uvector& ids1, rmm::device_uvector& ids2); // These are the specific overloads for RT-accelerated PIP queries void Evaluate(const rmm::cuda_stream_view& stream, const PointArrayView& geom_array1, const PolygonArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const MultiPointArrayView& geom_array1, const PolygonArrayView& geom_array2, - Predicate predicate, 
Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const PolygonArrayView& geom_array1, const PointArrayView& geom_array2, Predicate predicate, - Queue>& ids); + rmm::device_uvector& ids1, rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const PolygonArrayView& geom_array1, const MultiPointArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const PointArrayView& geom_array1, const MultiPolygonArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const MultiPointArrayView& geom_array1, const MultiPolygonArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& geom_array1, const PointArrayView& geom_array2, Predicate predicate, - Queue>& ids); + rmm::device_uvector& ids1, rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& geom_array1, const MultiPointArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void EvaluateImpl(const rmm::cuda_stream_view& stream, const PointArrayView& point_array, const MultiPointArrayView& multi_point_array, const PolygonArrayView& poly_array, - Predicate predicate, Queue>& ids, - bool inverse = false); + Predicate predicate, rmm::device_uvector& point_ids, + rmm::device_uvector& poly_ids, bool inverse = false); void EvaluateImpl(const rmm::cuda_stream_view& stream, const PointArrayView& point_array, const MultiPointArrayView& 
multi_point_array, const MultiPolygonArrayView& multi_poly_array, - Predicate predicate, Queue>& ids, - bool inverse); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2, bool inverse); + + size_t EstimateBVHSize(const rmm::cuda_stream_view& stream, + const PolygonArrayView& polys, + ArrayView poly_ids, int segs_per_aabb); + + size_t EstimateBVHSize(const rmm::cuda_stream_view& stream, + const MultiPolygonArrayView& multi_polys, + ArrayView multi_poly_ids, int segs_per_aabb); /** * Build BVH for a subset of polygons @@ -122,34 +136,27 @@ class RelateEngine { * @param polygon_ids * @param buffer */ - OptixTraversableHandle BuildBVH(const rmm::cuda_stream_view& stream, - const PolygonArrayView& polygons, - ArrayView polygon_ids, - rmm::device_uvector& seg_begins, - rmm::device_buffer& buffer, - rmm::device_uvector& aabb_poly_ids, - rmm::device_uvector& aabb_ring_ids); + OptixTraversableHandle BuildBVH( + const rmm::cuda_stream_view& stream, + const PolygonArrayView& polygons, ArrayView polygon_ids, + int segs_per_aabb, rmm::device_buffer& buffer, + rmm::device_uvector& aabb_poly_ids, + rmm::device_uvector& aabb_ring_ids, + rmm::device_uvector>& aabb_vertex_offsets); OptixTraversableHandle BuildBVH( const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& multi_polys, - ArrayView multi_poly_ids, rmm::device_uvector& seg_begins, - rmm::device_uvector& part_begins, rmm::device_buffer& buffer, + ArrayView multi_poly_ids, int segs_per_aabb, rmm::device_buffer& buffer, rmm::device_uvector& aabb_multi_poly_ids, rmm::device_uvector& aabb_part_ids, - rmm::device_uvector& aabb_ring_ids); - - size_t EstimateBVHSize(const rmm::cuda_stream_view& stream, - const PolygonArrayView& polys, - ArrayView poly_ids); - - size_t EstimateBVHSize(const rmm::cuda_stream_view& stream, - const MultiPolygonArrayView& multi_polys, - ArrayView multi_poly_ids); + rmm::device_uvector& aabb_ring_ids, + rmm::device_uvector>& aabb_vertex_offsets, + 
rmm::device_uvector& part_begins); private: Config config_; const DeviceGeometries* geoms1_; - const details::RTEngine* rt_engine_; + const RTEngine* rt_engine_; }; } // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/launch_parameters.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/launch_parameters.cuh similarity index 67% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/launch_parameters.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/launch_parameters.cuh index 555d2504c..a263fbcf2 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/launch_parameters.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/launch_parameters.cuh @@ -16,13 +16,13 @@ // under the License. #pragma once -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/geom/multi_point.cuh" -#include "gpuspatial/geom/multi_polygon.cuh" -#include "gpuspatial/geom/point.cuh" -#include "gpuspatial/geom/polygon.cuh" -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/queue_view.h" +#include "gpuspatial/geom/box.hpp" +#include "gpuspatial/geom/multi_point.hpp" +#include "gpuspatial/geom/multi_polygon.hpp" +#include "gpuspatial/geom/point.hpp" +#include "gpuspatial/geom/polygon.hpp" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/queue_view.hpp" #include @@ -31,29 +31,29 @@ namespace detail { template struct LaunchParamsPointQuery { - using box_t = Box>; - // Data structures of geometries1 - bool grouped; - ArrayView prefix_sum; // Only used when grouped - ArrayView reordered_indices; // Only used when grouped - ArrayView mbrs1; // MBR of each feature in geometries1 + using box_t = Box; + // Input + ArrayView rects; + ArrayView points; OptixTraversableHandle handle; - // Data structures of geometries2 - ArrayView points2; - // Output: Geom1 ID, Geom2 ID - QueueView> ids; + uint32_t* count; + // 
Output + QueueView rect_ids; + ArrayView point_ids; }; template struct LaunchParamsBoxQuery { - using box_t = Box>; + using box_t = Box; // Input - ArrayView mbrs1; - ArrayView mbrs2; + ArrayView rects1; + ArrayView rects2; // can be either geometries 1 or 2 OptixTraversableHandle handle; - // Output: Geom2 ID, Geom2 ID - QueueView> ids; + uint32_t* count; + // Output + QueueView rect1_ids; + ArrayView rect2_ids; }; /** @@ -67,12 +67,15 @@ struct LaunchParamsPolygonPointQuery { MultiPointArrayView multi_points; PointArrayView points; PolygonArrayView polygons; - ArrayView polygon_ids; // sorted - ArrayView> ids; + ArrayView uniq_polygon_ids; // sorted + index_t* query_point_ids; + index_t* query_polygon_ids; + size_t query_size; ArrayView seg_begins; ArrayView IMs; // intersection matrices OptixTraversableHandle handle; ArrayView aabb_poly_ids, aabb_ring_ids; + ArrayView> aabb_vertex_offsets; }; /** @@ -87,14 +90,16 @@ struct LaunchParamsPointMultiPolygonQuery { // Either MultiPointArrayView or PointArrayView will be used MultiPointArrayView multi_points; PointArrayView points; - ArrayView multi_polygon_ids; // sorted - ArrayView> ids; - ArrayView seg_begins; - ArrayView uniq_part_begins; + ArrayView uniq_multi_polygon_ids; // sorted + index_t* query_point_ids; + index_t* query_multi_polygon_ids; + size_t query_size; + ArrayView uniq_part_begins; // used to calculate z-index for parts // each query point has n elements of part_min_y and part_locations, n is # of parts ArrayView IMs; // intersection matrices OptixTraversableHandle handle; ArrayView aabb_multi_poly_ids, aabb_part_ids, aabb_ring_ids; + ArrayView> aabb_vertex_offsets; }; } // namespace detail diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/rt_engine.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/rt_engine.hpp similarity index 98% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/rt_engine.hpp rename to 
c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/rt_engine.hpp index d571feaa7..3b3019e46 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/rt_engine.hpp +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/rt_engine.hpp @@ -16,7 +16,7 @@ // under the License. #pragma once -#include "gpuspatial/utils/array_view.h" +#include "gpuspatial/utils/array_view.hpp" #include "rmm/cuda_stream.hpp" #include "rmm/device_uvector.hpp" @@ -33,7 +33,6 @@ #define GPUSPATIAL_OPTIX_LAUNCH_PARAMS_NAME "params" namespace gpuspatial { -namespace details { /*! SBT record for a raygen program */ struct __align__(OPTIX_SBT_RECORD_ALIGNMENT) RaygenRecord { @@ -160,6 +159,9 @@ RTConfig get_default_rt_config(const std::string& ptx_root); class RTEngine { public: + RTEngine(const RTEngine&) = delete; + RTEngine& operator=(const RTEngine&) = delete; + RTEngine(); ~RTEngine(); @@ -201,5 +203,4 @@ class RTEngine { bool initialized_; }; -} // namespace details } // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.hpp similarity index 98% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.hpp index f1d5fb487..da9339ae7 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/array_view.hpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
#pragma once -#include "gpuspatial/utils/cuda_utils.h" +#include "gpuspatial/utils/cuda_utils.hpp" #include namespace gpuspatial { diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.hpp similarity index 97% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.hpp index 2f6941704..4cca08fd0 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.hpp @@ -28,7 +28,7 @@ #else #define DEV_HOST -#define DEV_HOST_INLINE +#define DEV_HOST_INLINE inline #define DEV_INLINE #define CONST_STATIC_INIT(...) = __VA_ARGS__ diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.hpp similarity index 99% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.hpp index 91c5adce8..9bf3c9267 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/doubledouble.hpp @@ -68,7 +68,7 @@ #pragma once -#include "gpuspatial/utils/cuda_utils.h" +#include "gpuspatial/utils/cuda_utils.hpp" #include #include diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.hpp similarity index 95% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.hpp index a35005ebe..ab6f174e7 100644 --- 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.hpp @@ -53,7 +53,7 @@ inline void optixCheck(OptixResult res, const char* call, const char* file, std::stringstream ss; ss << "OptiX API call (" << call << ") failed with error " << optixGetErrorName(res) << " (" << file << ":" << line << ")"; - GPUSPATIAL_LOG_ERROR("Optix API error: {}", ss.str()); + GPUSPATIAL_LOG_ERROR("Optix API error: %s", ss.str()); throw GPUException(res, ss.str().c_str()); } } @@ -64,7 +64,7 @@ inline void cudaCheck(cudaError_t error, const char* call, const char* file, std::stringstream ss; ss << "CUDA API call (" << call << ") failed with error " << cudaGetErrorString(error) << " (" << file << ":" << line << ")"; - GPUSPATIAL_LOG_ERROR("CUDA API error: {}", ss.str()); + GPUSPATIAL_LOG_ERROR("CUDA API error: %s", ss.str()); throw GPUException(ss.str().c_str()); } } diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.hpp similarity index 99% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.hpp index 9014a552b..6512fe40c 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/floating_point.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include "gpuspatial/utils/cuda_utils.h" +#include "gpuspatial/utils/cuda_utils.hpp" #include #include diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp index 33c8d47bc..1cec9359f 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp +++ 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/gpu_timer.hpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. #pragma once -#include "gpuspatial/utils/exception.h" +#include "gpuspatial/utils/exception.hpp" #include namespace gpuspatial { diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.cuh similarity index 98% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.cuh index 5fc1d54ff..99c02b38c 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/helpers.cuh @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. #pragma once -#include "gpuspatial/utils/cuda_utils.h" +#include "gpuspatial/utils/cuda_utils.hpp" #include #include diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.hpp similarity index 94% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.hpp index 09c2c8aed..31c0b6a7d 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/launcher.hpp @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
#pragma once -#include "gpuspatial/utils/cuda_utils.h" -#include "gpuspatial/utils/exception.h" +#include "gpuspatial/utils/cuda_utils.hpp" +#include "gpuspatial/utils/exception.hpp" #include "rmm/cuda_stream_view.hpp" diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/markers.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/markers.hpp new file mode 100644 index 000000000..6cc62edb5 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/markers.hpp @@ -0,0 +1,145 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#pragma once + +#include +#define DISABLE_NVTX_MARKERS + +#ifndef DISABLE_NVTX_MARKERS +#include +#endif + +namespace gpuspatial { + +struct Category { + static constexpr uint32_t KernelWorkitems = 1; + static constexpr uint32_t IntervalWorkitems = 2; +}; + +// Colors in ARGB format (Alpha, Red, Green, Blue) +struct Color { + static constexpr uint32_t Red = 0xFF880000; + static constexpr uint32_t Green = 0xFF008800; + static constexpr uint32_t Blue = 0xFF000088; + static constexpr uint32_t Yellow = 0xFFFFFF00; + static constexpr uint32_t Default = 0; +}; + +#ifndef DISABLE_NVTX_MARKERS + +struct Instrument { + // --------------------------------------------------------------------------- + // Helper: Create attributes correctly using constructors + // --------------------------------------------------------------------------- + static nvtx3::event_attributes create_attr(const char* msg, uint32_t color_val, + uint32_t category_val) { + // 1. Basic Message + nvtx3::event_attributes attr{msg}; + + // 2. Apply Color (if not default) + if (color_val != Color::Default) { + // Use nvtx3::rgb wrapping the uint32_t directly usually works, + // but if it fails, we assign to the internal color_type directly via the generic + // color wrapper + attr = nvtx3::event_attributes{msg, nvtx3::color{color_val}}; + } + + // 3. Apply Category (if valid) + // Note: We cannot "append" to an existing immutable object. + // We must construct with all arguments at once. 
+ + if (color_val != Color::Default && category_val != 0) { + return nvtx3::event_attributes{msg, nvtx3::color{color_val}, + nvtx3::category{category_val}}; + } else if (color_val != Color::Default) { + return nvtx3::event_attributes{msg, nvtx3::color{color_val}}; + } else if (category_val != 0) { + return nvtx3::event_attributes{msg, nvtx3::category{category_val}}; + } + + return attr; + } + + // --------------------------------------------------------------------------- + // Instant Markers + // --------------------------------------------------------------------------- + static void Mark(const char* message, uint32_t color = Color::Default, + uint32_t category = 0) { + nvtx3::mark(create_attr(message, color, category)); + } + + static void MarkInt(int64_t value, const char* message, uint32_t color = Color::Default, + uint32_t category = 0) { + // Construct with payload immediately + // Note: If you need color+category+payload, the constructor list gets long. + // This covers the most common case: Message + Payload + if (color == Color::Default && category == 0) { + nvtx3::event_attributes attr{message, nvtx3::payload{value}}; + nvtx3::mark(attr); + } else { + // Fallback: manually construct complex attribute + // Most NVTX3 versions support {msg, color, payload, category} in any order + nvtx3::event_attributes attr{message, nvtx3::color{color}, + nvtx3::category{category}, nvtx3::payload{value}}; + nvtx3::mark(attr); + } + } + + static void MarkWorkitems(uint64_t items, const char* message = "Workitems") { + nvtx3::event_attributes attr{message, nvtx3::payload{items}, + nvtx3::category{Category::KernelWorkitems}}; + nvtx3::mark(attr); + } + + // --------------------------------------------------------------------------- + // Scoped Ranges (RAII) + // --------------------------------------------------------------------------- + struct Range { + nvtx3::scoped_range range; + + // Standard Range + explicit Range(const char* message, uint32_t color = Color::Default, 
+ uint32_t category = 0) + : range(Instrument::create_attr(message, color, category)) {} + + // Payload Range (for workitems/intervals) + explicit Range(const char* message, uint64_t payload, + uint32_t category = Category::IntervalWorkitems) + : range(nvtx3::event_attributes{message, nvtx3::payload{payload}, + nvtx3::category{category}}) {} + }; +}; + +#else + +// ----------------------------------------------------------------------------- +// No-Op Implementation +// ----------------------------------------------------------------------------- +struct Instrument { + static inline void Mark(const char*, uint32_t = 0, uint32_t = 0) {} + static inline void MarkInt(int64_t, const char*, uint32_t = 0, uint32_t = 0) {} + static inline void MarkWorkitems(uint64_t, const char*) {} + + struct Range { + explicit Range(const char*, uint32_t = 0, uint32_t = 0) {} + explicit Range(const char*, uint64_t, uint32_t = 0) {} + }; +}; + +#endif // DISABLE_NVTX_MARKERS + +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/mem_utils.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/mem_utils.hpp index 1b36c934f..779387676 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/mem_utils.hpp +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/mem_utils.hpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
#pragma once -#include "gpuspatial/utils/exception.h" +#include "gpuspatial/utils/exception.hpp" #include "rmm/cuda_stream_view.hpp" diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.hpp similarity index 98% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.hpp index ded74f02b..0867ed007 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/morton_code.hpp @@ -19,7 +19,7 @@ */ #pragma once -#include "gpuspatial/utils/cuda_utils.h" +#include "gpuspatial/utils/cuda_utils.hpp" #include #include diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.hpp similarity index 99% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.hpp index 73ac54d01..2c21ea5e4 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/pinned_vector.hpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
#pragma once -#include "gpuspatial/utils/exception.h" +#include "gpuspatial/utils/exception.hpp" #include // For CUDA memory management functions diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.hpp similarity index 95% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.hpp index 29beac229..c1921dca3 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.hpp @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. #pragma once -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/queue_view.h" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/queue_view.hpp" #include "rmm/cuda_stream_view.hpp" #include "rmm/device_scalar.hpp" @@ -41,6 +41,7 @@ class Queue { if (counter_ == nullptr) { counter_ = std::make_unique>(stream); } + Clear(stream); } void Clear(const rmm::cuda_stream_view& stream) { diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.hpp similarity index 96% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.hpp index e4b10ef9d..f907bff57 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue_view.hpp @@ -16,8 +16,8 @@ // under the License. 
#pragma once -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/cuda_utils.h" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" #include diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp similarity index 100% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/stopwatch.hpp diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/thread_pool.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/thread_pool.hpp similarity index 100% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/thread_pool.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/thread_pool.hpp diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/type_traits.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/type_traits.hpp similarity index 100% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/type_traits.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/type_traits.hpp diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc b/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc index 58ef354ab..e8494f3e2 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc +++ b/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc @@ -14,157 +14,342 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
+ #include "gpuspatial/gpuspatial_c.h" -#include "gpuspatial/index/spatial_joiner.hpp" +#include "gpuspatial/index/rt_spatial_index.hpp" +#include "gpuspatial/index/spatial_index.hpp" +#include "gpuspatial/mem/memory_manager.hpp" +#include "gpuspatial/refine/rt_spatial_refiner.hpp" +#include "gpuspatial/rt/rt_engine.hpp" +#include "gpuspatial/utils/exception.hpp" #include +#include +#include #include -#define GPUSPATIAL_ERROR_MSG_BUFFER_SIZE (1024) -struct GpuSpatialJoinerExporter { - static void Export(std::unique_ptr& idx, - struct GpuSpatialJoiner* out) { - out->private_data = idx.release(); - out->init = &CInit; +// ----------------------------------------------------------------------------- +// INTERNAL HELPERS +// ----------------------------------------------------------------------------- +// This is what the private_data points to for the public C interfaces +template +struct GpuSpatialWrapper { + T payload; + std::string last_error; // Pointer to std::string to store last error message +}; + +// The unified error handling wrapper +// Func: The lambda containing the logic +template +int SafeExecute(GpuSpatialWrapper* wrapper, Func&& func) { + try { + func(); + wrapper->last_error.clear(); + return 0; + } catch (const std::exception& e) { + wrapper->last_error = std::string(e.what()); + return EINVAL; + } catch (...) 
{ + wrapper->last_error = "Unknown internal error"; + return EINVAL; + } +} + +// ----------------------------------------------------------------------------- +// IMPLEMENTATION +// ----------------------------------------------------------------------------- + +struct GpuSpatialRuntimeExporter { + struct Payload { + std::shared_ptr rt_engine; + int device_id; + }; + + using private_data_t = GpuSpatialWrapper; + static void Export(struct GpuSpatialRuntime* out) { + private_data_t* private_data = + new private_data_t{Payload{std::make_shared()}, ""}; + out->init = CInit; + out->release = CRelease; + out->get_last_error = CGetLastError; + out->private_data = private_data; + } + + static int CInit(GpuSpatialRuntime* self, GpuSpatialRuntimeConfig* config) { + return SafeExecute(static_cast(self->private_data), [&] { + std::string ptx_root(config->ptx_root); + auto rt_config = gpuspatial::get_default_rt_config(ptx_root); + + GPUSPATIAL_LOG_INFO("Initializing GpuSpatialRuntime on device %d, PTX root %s", + config->device_id, config->ptx_root); + + CUDA_CHECK(cudaSetDevice(config->device_id)); + + gpuspatial::MemoryManager::instance().Init(config->use_cuda_memory_pool, + config->cuda_memory_pool_init_precent); + + static_cast(self->private_data) + ->payload.rt_engine->Init(rt_config); + }); + } + + static void CRelease(GpuSpatialRuntime* self) { + gpuspatial::MemoryManager::instance().Shutdown(); + delete static_cast(self->private_data); + self->private_data = nullptr; + } + + static const char* CGetLastError(GpuSpatialRuntime* self) { + auto* private_data = static_cast(self->private_data); + return private_data->last_error.c_str(); + } +}; + +void GpuSpatialRuntimeCreate(struct GpuSpatialRuntime* runtime) { + GpuSpatialRuntimeExporter::Export(runtime); +} + +using runtime_data_t = GpuSpatialRuntimeExporter::private_data_t; + +struct GpuSpatialIndexFloat2DExporter { + using scalar_t = float; + static constexpr int n_dim = 2; + using self_t = SedonaFloatIndex2D; + using 
spatial_index_t = gpuspatial::SpatialIndex; + + struct Payload { + std::unique_ptr index; + runtime_data_t* rdata; + }; + + struct ResultBuffer { + std::vector build_indices; + std::vector probe_indices; + ResultBuffer() = default; + + ResultBuffer(const ResultBuffer&) = delete; + ResultBuffer& operator=(const ResultBuffer&) = delete; + + ResultBuffer(ResultBuffer&&) = default; + ResultBuffer& operator=(ResultBuffer&&) = default; + }; + + using private_data_t = GpuSpatialWrapper; + using context_t = GpuSpatialWrapper; + + static void Export(const struct GpuSpatialIndexConfig* config, + struct SedonaFloatIndex2D* out) { + auto* rdata = static_cast(config->runtime->private_data); + + gpuspatial::RTSpatialIndexConfig index_config; + + index_config.rt_engine = rdata->payload.rt_engine; + index_config.concurrency = config->concurrency; + + // Create SpatialIndex may involve GPU operations, set device here + CUDA_CHECK(cudaSetDevice(rdata->payload.device_id)); + + auto uniq_index = gpuspatial::CreateRTSpatialIndex(index_config); + out->clear = &CClear; - out->push_build = &CPushBuild; - out->finish_building = &CFinishBuilding; out->create_context = &CCreateContext; out->destroy_context = &CDestroyContext; - out->push_stream = &CPushStream; + out->push_build = &CPushBuild; + out->finish_building = &CFinishBuilding; + out->probe = &CProbe; out->get_build_indices_buffer = &CGetBuildIndicesBuffer; - out->get_stream_indices_buffer = &CGetStreamIndicesBuffer; + out->get_probe_indices_buffer = &CGetProbeIndicesBuffer; + out->get_last_error = &CGetLastError; + out->context_get_last_error = &CContextGetLastError; out->release = &CRelease; - out->last_error = new char[GPUSPATIAL_ERROR_MSG_BUFFER_SIZE]; - } - - static int CInit(struct GpuSpatialJoiner* self, struct GpuSpatialJoinerConfig* config) { - int err = 0; - auto* joiner = static_cast(self->private_data); - try { - gpuspatial::InitSpatialJoiner(joiner, config->ptx_root, config->concurrency); - } catch (const std::exception& 
e) { - int len = - std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1)); - auto* last_error = const_cast(self->last_error); - strncpy(last_error, e.what(), len); - last_error[len] = '\0'; - err = EINVAL; - } - return err; - } - - static void CCreateContext(struct GpuSpatialJoiner* self, - struct GpuSpatialJoinerContext* context) { - auto* joiner = static_cast(self->private_data); - context->private_data = new std::shared_ptr(joiner->CreateContext()); - context->last_error = new char[GPUSPATIAL_ERROR_MSG_BUFFER_SIZE]; - context->build_indices = new std::vector(); - context->stream_indices = new std::vector(); - } - - static void CDestroyContext(struct GpuSpatialJoinerContext* context) { - delete (std::shared_ptr*)context->private_data; - delete[] context->last_error; - delete (std::vector*)context->build_indices; - delete (std::vector*)context->stream_indices; + out->private_data = new private_data_t{Payload{std::move(uniq_index), rdata}, ""}; + } + + static void CCreateContext(struct SedonaSpatialIndexContext* context) { + context->private_data = new context_t(); + } + + static void CDestroyContext(struct SedonaSpatialIndexContext* context) { + delete static_cast(context->private_data); context->private_data = nullptr; - context->last_error = nullptr; - context->build_indices = nullptr; - context->stream_indices = nullptr; - } - - static void CClear(struct GpuSpatialJoiner* self) { - auto* joiner = static_cast(self->private_data); - joiner->Clear(); - } - - static int CPushBuild(struct GpuSpatialJoiner* self, const struct ArrowSchema* schema, - const struct ArrowArray* array, int64_t offset, int64_t length) { - auto* joiner = static_cast(self->private_data); - int err = 0; - try { - joiner->PushBuild(schema, array, offset, length); - } catch (const std::exception& e) { - int len = - std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1)); - auto* last_error = const_cast(self->last_error); - strncpy(last_error, e.what(), 
len); - last_error[len] = '\0'; - err = EINVAL; - } - return err; - } - - static int CFinishBuilding(struct GpuSpatialJoiner* self) { - auto* joiner = static_cast(self->private_data); - int err = 0; - try { - joiner->FinishBuilding(); - } catch (const std::exception& e) { - int len = - std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1)); - auto* last_error = const_cast(self->last_error); - strncpy(last_error, e.what(), len); - last_error[len] = '\0'; - err = EINVAL; - } - return err; - } - - static int CPushStream(struct GpuSpatialJoiner* self, - struct GpuSpatialJoinerContext* context, - const struct ArrowSchema* schema, const struct ArrowArray* array, - int64_t offset, int64_t length, - enum GpuSpatialPredicate predicate, int32_t array_index_offset) { - auto* joiner = static_cast(self->private_data); - auto* private_data = - (std::shared_ptr*)context->private_data; - int err = 0; - try { - joiner->PushStream(private_data->get(), schema, array, offset, length, - static_cast(predicate), - static_cast*>(context->build_indices), - static_cast*>(context->stream_indices), - array_index_offset); - } catch (const std::exception& e) { - int len = - std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1)); - strncpy((char*)context->last_error, e.what(), len); - ((char*)context->last_error)[len] = '\0'; - err = EINVAL; - } - return err; - } - - static void CGetBuildIndicesBuffer(struct GpuSpatialJoinerContext* context, - void** build_indices, + } + + static int CClear(self_t* self) { + return SafeExecute(static_cast(self->private_data), + [=] { use_index(self).Clear(); }); + } + + static int CPushBuild(self_t* self, const float* buf, uint32_t n_rects) { + return SafeExecute(static_cast(self->private_data), [&] { + auto* rects = reinterpret_cast(buf); + use_index(self).PushBuild(rects, n_rects); + }); + } + + static int CFinishBuilding(self_t* self) { + return SafeExecute(static_cast(self->private_data), + [&] { 
use_index(self).FinishBuilding(); }); + } + + static int CProbe(self_t* self, SedonaSpatialIndexContext* context, const float* buf, + uint32_t n_rects) { + return SafeExecute(static_cast(context->private_data), [&] { + auto* rects = reinterpret_cast(buf); + auto& buff = static_cast(context->private_data)->payload; + use_index(self).Probe(rects, n_rects, &buff.build_indices, &buff.probe_indices); + }); + } + + static void CGetBuildIndicesBuffer(struct SedonaSpatialIndexContext* context, + uint32_t** build_indices, uint32_t* build_indices_length) { - auto* vec = static_cast*>(context->build_indices); + auto* ctx = static_cast(context->private_data); + *build_indices = ctx->payload.build_indices.data(); + *build_indices_length = ctx->payload.build_indices.size(); + } - *build_indices = vec->data(); - *build_indices_length = vec->size(); + static void CGetProbeIndicesBuffer(struct SedonaSpatialIndexContext* context, + uint32_t** probe_indices, + uint32_t* probe_indices_length) { + auto* ctx = static_cast(context->private_data); + *probe_indices = ctx->payload.probe_indices.data(); + *probe_indices_length = ctx->payload.probe_indices.size(); } - static void CGetStreamIndicesBuffer(struct GpuSpatialJoinerContext* context, - void** stream_indices, - uint32_t* stream_indices_length) { - auto* vec = static_cast*>(context->stream_indices); + static const char* CGetLastError(self_t* self) { + auto* private_data = static_cast(self->private_data); + return private_data->last_error.c_str(); + } - *stream_indices = vec->data(); - *stream_indices_length = vec->size(); + static const char* CContextGetLastError(SedonaSpatialIndexContext* self) { + auto* private_data = static_cast(self->private_data); + return private_data->last_error.c_str(); } - static void CRelease(struct GpuSpatialJoiner* self) { - delete[] self->last_error; - auto* joiner = static_cast(self->private_data); - delete joiner; + static void CRelease(self_t* self) { + delete static_cast(self->private_data); 
self->private_data = nullptr; - self->last_error = nullptr; + } + + static spatial_index_t& use_index(self_t* self) { + auto* private_data = static_cast(self->private_data); + auto* r_data = private_data->payload.rdata; + + CUDA_CHECK(cudaSetDevice(r_data->payload.device_id)); + return *(private_data->payload.index); } }; -void GpuSpatialJoinerCreate(struct GpuSpatialJoiner* joiner) { - auto idx = gpuspatial::CreateSpatialJoiner(); - GpuSpatialJoinerExporter::Export(idx, joiner); +int GpuSpatialIndexFloat2DCreate(struct SedonaFloatIndex2D* index, + const struct GpuSpatialIndexConfig* config) { + try { + GpuSpatialIndexFloat2DExporter::Export(config, index); + } catch (std::exception& e) { + GPUSPATIAL_LOG_ERROR("Failed to create GpuSpatialIndexFloat2D: %s", e.what()); + return EINVAL; + } + return 0; +} + +struct GpuSpatialRefinerExporter { + struct Payload { + std::unique_ptr refiner; + runtime_data_t* rdata; + }; + using private_data_t = GpuSpatialWrapper; + + static void Export(const GpuSpatialRefinerConfig* config, + struct SedonaSpatialRefiner* out) { + auto* rdata = static_cast(config->runtime->private_data); + + gpuspatial::RTSpatialRefinerConfig refiner_config; + + refiner_config.rt_engine = rdata->payload.rt_engine; + refiner_config.concurrency = config->concurrency; + refiner_config.compact = config->compress_bvh; + refiner_config.pipeline_batches = config->pipeline_batches; + + // Create Refinner may involve GPU operations, set device here + CUDA_CHECK(cudaSetDevice(rdata->payload.device_id)); + + auto refiner = gpuspatial::CreateRTSpatialRefiner(refiner_config); + + out->clear = &CClear; + out->push_build = &CPushBuild; + out->finish_building = &CFinishBuilding; + out->refine_loaded = &CRefineLoaded; + out->refine = &CRefine; + out->get_last_error = &CGetLastError; + out->release = &CRelease; + out->private_data = new private_data_t{Payload{std::move(refiner), rdata}, ""}; + } + + static int CClear(SedonaSpatialRefiner* self) { + return 
SafeExecute(static_cast(self->private_data), + [&] { use_refiner(self).Clear(); }); + } + + static int CPushBuild(SedonaSpatialRefiner* self, const ArrowSchema* build_schema, + const ArrowArray* build_array) { + return SafeExecute(static_cast(self->private_data), + [&] { use_refiner(self).PushBuild(build_schema, build_array); }); + } + + static int CFinishBuilding(SedonaSpatialRefiner* self) { + return SafeExecute(static_cast(self->private_data), + [&] { use_refiner(self).FinishBuilding(); }); + } + + static int CRefineLoaded(SedonaSpatialRefiner* self, const ArrowSchema* probe_schema, + const ArrowArray* probe_array, + SedonaSpatialRelationPredicate predicate, + uint32_t* build_indices, uint32_t* probe_indices, + uint32_t indices_size, uint32_t* new_indices_size) { + return SafeExecute(static_cast(self->private_data), [&] { + *new_indices_size = use_refiner(self).Refine( + probe_schema, probe_array, static_cast(predicate), + build_indices, probe_indices, indices_size); + }); + } + + static int CRefine(SedonaSpatialRefiner* self, const ArrowSchema* schema1, + const ArrowArray* array1, const ArrowSchema* schema2, + const ArrowArray* array2, SedonaSpatialRelationPredicate predicate, + uint32_t* indices1, uint32_t* indices2, uint32_t indices_size, + uint32_t* new_indices_size) { + return SafeExecute(static_cast(self->private_data), [&] { + *new_indices_size = use_refiner(self).Refine( + schema1, array1, schema2, array2, static_cast(predicate), + indices1, indices2, indices_size); + }); + } + + static const char* CGetLastError(SedonaSpatialRefiner* self) { + auto* private_data = static_cast(self->private_data); + return private_data->last_error.c_str(); + } + + static void CRelease(SedonaSpatialRefiner* self) { + delete static_cast(self->private_data); + self->private_data = nullptr; + } + + static gpuspatial::SpatialRefiner& use_refiner(SedonaSpatialRefiner* self) { + auto* private_data = static_cast(self->private_data); + auto* r_data = private_data->payload.rdata; + 
+ CUDA_CHECK(cudaSetDevice(r_data->payload.device_id)); + return *(private_data->payload.refiner); + } +}; + +int GpuSpatialRefinerCreate(SedonaSpatialRefiner* refiner, + const GpuSpatialRefinerConfig* config) { + try { + GpuSpatialRefinerExporter::Export(config, refiner); + } catch (std::exception& e) { + GPUSPATIAL_LOG_ERROR("Failed to create GpuSpatialRefiner: %s", e.what()); + return EINVAL; + } + return 0; } diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc b/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc new file mode 100644 index 000000000..fdf66e700 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/src/memory_manager.cc @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gpuspatial/mem/memory_manager.hpp" +#include "gpuspatial/utils/logger.hpp" + +#if defined(_WIN32) +#include +#elif defined(__linux__) +#include +#else // POSIX (BSD, Solaris, etc.) 
+#include +#endif +namespace gpuspatial { +namespace detail { +inline long long get_free_physical_memory() { +#if defined(_WIN32) + // --- Windows --- + MEMORYSTATUSEX status; + status.dwLength = sizeof(status); + if (GlobalMemoryStatusEx(&status)) { + return (long long)status.ullAvailPhys; + } + return 0; + +#elif defined(__linux__) + // --- Linux (sysinfo) --- + struct sysinfo info; + if (sysinfo(&info) == 0) { + return (long long)info.freeram * (long long)info.mem_unit; + } + return 0; + +#else + // --- Generic POSIX --- + // _SC_AVPHYS_PAGES: The number of physical memory pages not currently in use. + long pages = sysconf(_SC_AVPHYS_PAGES); + long page_size = sysconf(_SC_PAGESIZE); + + if (pages > 0 && page_size > 0) { + return (long long)pages * (long long)page_size; + } + return 0; +#endif +} +} // namespace detail + +MemoryManager& MemoryManager::instance() { + static MemoryManager instance; + return instance; +} + +MemoryManager::~MemoryManager() { Shutdown(); } + +void MemoryManager::Shutdown() { + if (is_initialized_) { + rmm::mr::set_current_device_resource(nullptr); + active_resource_.reset(); + pool_mr_.reset(); + cuda_mr_.reset(); + raw_tracker_ptr_ = nullptr; + is_initialized_ = false; + } +} + +void MemoryManager::Init(bool use_pool, int init_pool_precent) { + if (is_initialized_) { + GPUSPATIAL_LOG_WARN( + "MemoryManager is already initialized. 
Skipping re-initialization."); + return; + } + + cuda_mr_ = std::make_unique(); + use_pool_ = use_pool; + + if (use_pool_) { + auto safe_precent = std::max(0, std::min(init_pool_precent, 100)); + auto pool_bytes = rmm::percent_of_free_device_memory(safe_precent); + + GPUSPATIAL_LOG_INFO("Creating RMM pool memory resource with size %zu MB", + pool_bytes / 1024 / 1024); + + pool_mr_ = std::make_unique(cuda_mr_.get(), pool_bytes); + active_resource_ = std::make_unique(pool_mr_.get()); + } else { + active_resource_ = std::make_unique(cuda_mr_.get()); + } + + raw_tracker_ptr_ = active_resource_.get(); + + rmm::mr::set_current_device_resource(active_resource_.get()); + is_initialized_ = true; +} + +size_t MemoryManager::get_available_device_memory() const { + auto avail_bytes = rmm::available_device_memory().first; + if (!is_initialized_ || !use_pool_) { + return avail_bytes; + } + + // --- POOL STRATEGY --- + auto* tracker = static_cast(raw_tracker_ptr_); + size_t used = tracker->get_allocated_bytes(); + + // Safety Buffer: 5% of TOTAL capacity (not just pool capacity) + size_t safe_limit = static_cast(avail_bytes * 0.95); + + return (used < safe_limit) ? (safe_limit - used) : 0; +} + +size_t MemoryManager::get_available_host_memory() { + return detail::get_free_physical_memory(); +} +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu b/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu index da978012c..db081da22 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu @@ -14,19 +14,21 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
-#include "gpuspatial/index/detail/launch_parameters.h" -#include "gpuspatial/index/geometry_grouper.hpp" -#include "gpuspatial/index/relate_engine.cuh" -#include "gpuspatial/relate/predicate.cuh" -#include "gpuspatial/relate/relate.cuh" -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/helpers.h" -#include "gpuspatial/utils/launcher.h" +#include "gpuspatial/mem/memory_manager.hpp" +#include "gpuspatial/relate/predicate.hpp" +#include "gpuspatial/relate/relate.hpp" +#include "gpuspatial/relate/relate_engine.cuh" +#include "gpuspatial/rt/launch_parameters.cuh" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/helpers.cuh" +#include "gpuspatial/utils/launcher.hpp" #include "gpuspatial/utils/logger.hpp" -#include "gpuspatial/utils/queue.h" #include "rt/shaders/shader_id.hpp" +#include +#include #include "rmm/cuda_stream_view.hpp" +#include "rmm/device_scalar.hpp" #include "rmm/exec_policy.hpp" #include @@ -93,6 +95,92 @@ DEV_HOST_INLINE bool EvaluatePredicate(Predicate p, int32_t im) { } return false; } + +template +uint32_t ComputeNumAabbs(const rmm::cuda_stream_view& stream, + const PolygonArrayView& polygons, + ArrayView polygon_ids, int segs_per_aabb) { + auto n_polygons = polygon_ids.size(); + + rmm::device_uvector n_aabbs(n_polygons, stream); + auto* p_n_aabbs = n_aabbs.data(); + + LaunchKernel(stream, [=] __device__() { + using WarpReduce = cub::WarpReduce; + __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; + auto lane = threadIdx.x % 32; + auto warp_id = threadIdx.x / 32; + auto global_warp_id = TID_1D / 32; + auto n_warps = TOTAL_THREADS_1D / 32; + + for (auto i = global_warp_id; i < n_polygons; i += n_warps) { + auto id = polygon_ids[i]; + const auto& polygon = polygons[id]; + uint32_t total_segs = 0; + + for (auto ring = lane; ring < polygon.num_rings(); ring += 32) { + total_segs += + (polygon.get_ring(ring).num_segments() + segs_per_aabb - 1) / segs_per_aabb; + } + total_segs = 
WarpReduce(temp_storage[warp_id]).Sum(total_segs); + if (lane == 0) { + p_n_aabbs[i] = total_segs; + } + } + }); + return thrust::reduce(rmm::exec_policy_nosync(stream), n_aabbs.begin(), n_aabbs.end()); +} + +template +uint32_t ComputeNumAabbs(const rmm::cuda_stream_view& stream, + const MultiPolygonArrayView& multi_polygons, + ArrayView multi_polygon_ids, int segs_per_aabb) { + auto n_multi_polygons = multi_polygon_ids.size(); + rmm::device_uvector n_aabbs(n_multi_polygons, stream); + auto* p_n_aabbs = n_aabbs.data(); + + LaunchKernel(stream, [=] __device__() { + using WarpReduce = cub::WarpReduce; + __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; + auto lane = threadIdx.x % 32; + auto warp_id = threadIdx.x / 32; + auto global_warp_id = TID_1D / 32; + auto n_warps = TOTAL_THREADS_1D / 32; + + for (auto i = global_warp_id; i < n_multi_polygons; i += n_warps) { + auto id = multi_polygon_ids[i]; + const auto& multi_polygon = multi_polygons[id]; + + uint32_t multipoly_aabb_count = 0; + + for (int part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) { + auto polygon = multi_polygon.get_polygon(part_idx); + + // Local accumulator for this thread + uint32_t thread_aabb_count = 0; + + for (auto ring = lane; ring < polygon.num_rings(); ring += 32) { + auto n_segs = polygon.get_ring(ring).num_segments(); + + thread_aabb_count += (n_segs + segs_per_aabb - 1) / segs_per_aabb; + } + + // Reduce across the warp to get total AABBs for this polygon (part) + uint32_t part_total = WarpReduce(temp_storage[warp_id]).Sum(thread_aabb_count); + + // Add this part's total to the multi-polygon accumulator + if (lane == 0) { + multipoly_aabb_count += part_total; + } + } + + if (lane == 0) { + p_n_aabbs[i] = multipoly_aabb_count; + } + } + }); + return thrust::reduce(rmm::exec_policy_nosync(stream), n_aabbs.begin(), n_aabbs.end()); +} } // namespace detail template @@ -102,48 +190,49 @@ RelateEngine::RelateEngine( template RelateEngine::RelateEngine( - 
const DeviceGeometries* geoms1, const details::RTEngine* rt_engine) + const DeviceGeometries* geoms1, const RTEngine* rt_engine) : geoms1_(geoms1), rt_engine_(rt_engine) {} template void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const DeviceGeometries& geoms2, - Predicate predicate, Queue>& ids) { + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2) { switch (geoms2.get_geometry_type()) { case GeometryType::kPoint: { using geom2_array_view_t = PointArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } case GeometryType::kMultiPoint: { using geom2_array_view_t = MultiPointArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } case GeometryType::kLineString: { using geom2_array_view_t = LineStringArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } case GeometryType::kMultiLineString: { using geom2_array_view_t = MultiLineStringArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } case GeometryType::kPolygon: { using geom2_array_view_t = PolygonArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } case GeometryType::kMultiPolygon: { using geom2_array_view_t = MultiPolygonArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } default: @@ -153,44 +242,46 @@ void RelateEngine::Evaluate( template template -void RelateEngine::Evaluate( - const rmm::cuda_stream_view& stream, const GEOM2_ARRAY_VIEW_T& geom_array2, - Predicate predicate, Queue>& ids) { +void RelateEngine::Evaluate(const rmm::cuda_stream_view& stream, + const GEOM2_ARRAY_VIEW_T& geom_array2, + Predicate predicate, + rmm::device_uvector& ids1, + rmm::device_uvector& 
ids2) { switch (geoms1_->get_geometry_type()) { case GeometryType::kPoint: { using geom1_array_view_t = PointArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } case GeometryType::kMultiPoint: { using geom1_array_view_t = MultiPointArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } case GeometryType::kLineString: { using geom1_array_view_t = LineStringArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } case GeometryType::kMultiLineString: { using geom1_array_view_t = MultiLineStringArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } case GeometryType::kPolygon: { using geom1_array_view_t = PolygonArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } case GeometryType::kMultiPolygon: { using geom1_array_view_t = MultiPolygonArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } default: @@ -200,11 +291,14 @@ void RelateEngine::Evaluate( template template -void RelateEngine::Evaluate( - const rmm::cuda_stream_view& stream, const GEOM1_ARRAY_VIEW_T& geom_array1, - const GEOM2_ARRAY_VIEW_T& geom_array2, Predicate predicate, - Queue>& ids) { - size_t ids_size = ids.size(stream); +void RelateEngine::Evaluate(const rmm::cuda_stream_view& stream, + const GEOM1_ARRAY_VIEW_T& geom_array1, + const GEOM2_ARRAY_VIEW_T& geom_array2, + Predicate predicate, + rmm::device_uvector& ids1, + rmm::device_uvector& ids2) { + assert(ids1.size() == ids2.size()); + size_t ids_size = ids1.size(); GPUSPATIAL_LOG_INFO( 
"Refine with generic kernel, geom1 %zu, geom2 %zu, predicate %s, result size %zu", geom_array1.size(), geom_array2.size(), PredicateToString(predicate), ids_size); @@ -219,20 +313,24 @@ void RelateEngine::Evaluate( GPUSPATIAL_LOG_WARN( "Evaluate Polygon-Polygon relate with the GPU, which is not well-tested and the performance may be poor."); } - auto end = thrust::remove_if( - rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - [=] __device__(const thrust::pair& pair) { - auto geom1_id = pair.first; - auto geom2_id = pair.second; - const auto& geom1 = geom_array1[geom1_id]; - const auto& geom2 = geom_array2[geom2_id]; - - auto IM = relate(geom1, geom2); - return !detail::EvaluatePredicate(predicate, IM); - }); - size_t new_size = thrust::distance(ids.data(), end); - GPUSPATIAL_LOG_INFO("Refined, result size %zu", new_size); - ids.set_size(stream, new_size); + auto zip_begin = + thrust::make_zip_iterator(thrust::make_tuple(ids1.begin(), ids2.begin())); + auto zip_end = thrust::make_zip_iterator(thrust::make_tuple(ids1.end(), ids2.end())); + + auto end = + thrust::remove_if(rmm::exec_policy_nosync(stream), zip_begin, zip_end, + [=] __device__(const thrust::tuple& tuple) { + auto geom1_id = thrust::get<0>(tuple); + auto geom2_id = thrust::get<1>(tuple); + const auto& geom1 = geom_array1[geom1_id]; + const auto& geom2 = geom_array2[geom2_id]; + + auto IM = relate(geom1, geom2); + return !detail::EvaluatePredicate(predicate, IM); + }); + size_t new_size = thrust::distance(zip_begin, end); + ids1.resize(new_size, stream); + ids2.resize(new_size, stream); } template @@ -240,9 +338,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const PointArrayView& geom_array1, const PolygonArrayView& geom_array2, Predicate predicate, - Queue>& ids) { + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, geom_array1, MultiPointArrayView(), geom_array2, - predicate, ids, false /*inverse IM*/); + predicate, ids1, ids2, false 
/*inverse IM*/); } template @@ -250,9 +348,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const MultiPointArrayView& geom_array1, const PolygonArrayView& geom_array2, Predicate predicate, - Queue>& ids) { + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, PointArrayView(), geom_array1, geom_array2, - predicate, ids, false /*inverse IM*/); + predicate, ids1, ids2, false /*inverse IM*/); } template @@ -260,19 +358,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const PolygonArrayView& geom_array1, const PointArrayView& geom_array2, Predicate predicate, - Queue>& ids) { - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, geom_array2, MultiPointArrayView(), geom_array1, - predicate, ids, true /*inverse IM*/); - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + predicate, ids2, ids1, true /*inverse IM*/); } template @@ -280,19 +368,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const PolygonArrayView& geom_array1, const MultiPointArrayView& geom_array2, Predicate predicate, - Queue>& ids) { - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, PointArrayView(), geom_array2, geom_array1, - predicate, ids, true /*inverse IM*/); - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + predicate, ids2, ids1, 
true /*inverse IM*/); } template @@ -300,9 +378,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const PointArrayView& geom_array1, const MultiPolygonArrayView& geom_array2, Predicate predicate, - Queue>& ids) { + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, geom_array1, MultiPointArrayView(), geom_array2, - predicate, ids, false /*inverse IM*/); + predicate, ids1, ids2, false /*inverse IM*/); } template @@ -310,9 +388,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const MultiPointArrayView& geom_array1, const MultiPolygonArrayView& geom_array2, Predicate predicate, - Queue>& ids) { + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, PointArrayView(), geom_array1, geom_array2, - predicate, ids, false /*inverse IM*/); + predicate, ids1, ids2, false /*inverse IM*/); } template @@ -320,19 +398,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& geom_array1, const PointArrayView& geom_array2, Predicate predicate, - Queue>& ids) { - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, geom_array2, MultiPointArrayView(), geom_array1, - predicate, ids, true /*inverse IM*/); - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + predicate, ids2, ids1, true /*inverse IM*/); } template @@ -340,19 +408,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& geom_array1, const MultiPointArrayView& geom_array2, Predicate predicate, - Queue>& ids) { - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] 
__device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, PointArrayView(), geom_array2, geom_array1, - predicate, ids, true /*inverse IM*/); - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + predicate, ids2, ids1, true /*inverse IM*/); } template @@ -361,10 +419,15 @@ void RelateEngine::EvaluateImpl( const PointArrayView& point_array, const MultiPointArrayView& multi_point_array, const PolygonArrayView& poly_array, Predicate predicate, - Queue>& ids, bool inverse) { + rmm::device_uvector& point_ids, rmm::device_uvector& poly_ids, + bool inverse) { + // Casting short rays from each point to do precise point-in-polygon test + // Reference: "Geng L, Lee R, Zhang X. Rayjoin: Fast and precise spatial join. + // InProceedings of the 38th ACM International Conference on Supercomputing 2024" using params_t = detail::LaunchParamsPolygonPointQuery; - - size_t ids_size = ids.size(stream); + assert(point_array.empty() || multi_point_array.empty()); + assert(point_ids.size() == poly_ids.size()); + size_t ids_size = point_ids.size(); GPUSPATIAL_LOG_INFO( "Refine with ray-tracing, (multi-)point %zu, polygon %zu, predicate %s, result size %zu, inverse %d", !point_array.empty() ? 
point_array.size() : multi_point_array.size(), @@ -373,79 +436,88 @@ void RelateEngine::EvaluateImpl( if (ids_size == 0) { return; } - // pair.first is point id; pair.second is polygon id - // Sort by multi polygon id - thrust::sort(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - [] __device__(const thrust::pair& pair1, - const thrust::pair& pair2) { - return pair1.second < pair2.second; + + auto zip_begin = + thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin(), poly_ids.begin())); + auto zip_end = + thrust::make_zip_iterator(thrust::make_tuple(point_ids.end(), poly_ids.end())); + auto invalid_tuple = thrust::make_tuple(std::numeric_limits::max(), + std::numeric_limits::max()); + + // Sort by polygon id + thrust::sort(rmm::exec_policy_nosync(stream), zip_begin, zip_end, + [] __device__(const thrust::tuple& tu1, + const thrust::tuple& tu2) { + return thrust::get<1>(tu1) < thrust::get<1>(tu2); }); - rmm::device_uvector poly_ids(ids_size, stream); + rmm::device_uvector uniq_poly_ids(ids_size, stream); - thrust::transform(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - poly_ids.data(), - [] __device__(const thrust::pair& pair) { - return pair.second; - }); - auto poly_ids_end = - thrust::unique(rmm::exec_policy_nosync(stream), poly_ids.begin(), poly_ids.end()); - poly_ids.resize(thrust::distance(poly_ids.begin(), poly_ids_end), stream); - poly_ids.shrink_to_fit(stream); + thrust::copy(rmm::exec_policy_nosync(stream), poly_ids.begin(), poly_ids.end(), + uniq_poly_ids.begin()); - auto bvh_bytes = EstimateBVHSize(stream, poly_array, ArrayView(poly_ids)); - size_t avail_bytes = rmm::available_device_memory().first * config_.memory_quota; + // Collect uniq polygon ids to estimate total BVH memory usage + auto uniq_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream), + uniq_poly_ids.begin(), uniq_poly_ids.end()); + uniq_poly_ids.resize(thrust::distance(uniq_poly_ids.begin(), uniq_poly_ids_end), + stream); + 
uniq_poly_ids.shrink_to_fit(stream); + + auto bvh_bytes = EstimateBVHSize(stream, poly_array, ArrayView(uniq_poly_ids), + config_.segs_per_aabb); + size_t avail_bytes = + MemoryManager::instance().get_available_device_memory() * config_.memory_quota; auto n_batches = bvh_bytes / avail_bytes + 1; auto batch_size = (ids_size + n_batches - 1) / n_batches; - auto invalid_pair = thrust::make_pair(std::numeric_limits::max(), - std::numeric_limits::max()); GPUSPATIAL_LOG_INFO( "Unique polygons %zu, memory quota %zu MB, estimated BVH size %zu MB", - poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024)); + uniq_poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024)); for (int batch = 0; batch < n_batches; batch++) { auto ids_begin = batch * batch_size; auto ids_end = std::min(ids_begin + batch_size, ids_size); auto ids_size_batch = ids_end - ids_begin; - poly_ids.resize(ids_size_batch, stream); - thrust::transform(rmm::exec_policy_nosync(stream), ids.data() + ids_begin, - ids.data() + ids_end, poly_ids.data(), - [] __device__(const thrust::pair& pair) { - return pair.second; - }); + // Extract unique polygon IDs in this batch + uniq_poly_ids.resize(ids_size_batch, stream); + thrust::copy(rmm::exec_policy_nosync(stream), poly_ids.begin() + ids_begin, + poly_ids.begin() + ids_end, uniq_poly_ids.begin()); - // ids is sorted - poly_ids_end = - thrust::unique(rmm::exec_policy_nosync(stream), poly_ids.begin(), poly_ids.end()); + // poly ids are sorted + uniq_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream), + uniq_poly_ids.begin(), uniq_poly_ids.end()); - poly_ids.resize(thrust::distance(poly_ids.begin(), poly_ids_end), stream); - poly_ids.shrink_to_fit(stream); + uniq_poly_ids.resize(thrust::distance(uniq_poly_ids.begin(), uniq_poly_ids_end), + stream); + uniq_poly_ids.shrink_to_fit(stream); rmm::device_uvector IMs(ids_size_batch, stream); - rmm::device_uvector seg_begins(0, stream); rmm::device_uvector locations(ids_size_batch, 
stream); rmm::device_buffer bvh_buffer(0, stream); rmm::device_uvector aabb_poly_ids(0, stream), aabb_ring_ids(0, stream); + rmm::device_uvector> aabb_vertex_offsets(0, stream); // aabb id -> vertex begin[polygon] + ith point in this polygon - auto handle = BuildBVH(stream, poly_array, ArrayView(poly_ids), seg_begins, - bvh_buffer, aabb_poly_ids, aabb_ring_ids); + auto handle = BuildBVH(stream, poly_array, ArrayView(uniq_poly_ids), + config_.segs_per_aabb, bvh_buffer, aabb_poly_ids, + aabb_ring_ids, aabb_vertex_offsets); params_t params; params.points = point_array; params.multi_points = multi_point_array; params.polygons = poly_array; - params.polygon_ids = ArrayView(poly_ids); - params.ids = ArrayView>(ids.data() + ids_begin, - ids_size_batch); - params.seg_begins = ArrayView(seg_begins); + params.uniq_polygon_ids = ArrayView(uniq_poly_ids); + params.query_point_ids = point_ids.data() + ids_begin; + params.query_polygon_ids = poly_ids.data() + ids_begin; + params.query_size = ids_size_batch; params.IMs = ArrayView(IMs); params.handle = handle; params.aabb_poly_ids = ArrayView(aabb_poly_ids); params.aabb_ring_ids = ArrayView(aabb_ring_ids); + params.aabb_vertex_offsets = + ArrayView>(aabb_vertex_offsets); rmm::device_buffer params_buffer(sizeof(params_t), stream); @@ -457,34 +529,32 @@ void RelateEngine::EvaluateImpl( dim3{static_cast(ids_size_batch), 1, 1}, ArrayView((char*)params_buffer.data(), params_buffer.size())); - auto* p_IMs = IMs.data(); - auto* p_ids = ids.data(); - - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(ids_size_batch), - ids.data() + ids_begin, [=] __device__(uint32_t i) { - const auto& pair = p_ids[ids_begin + i]; - - auto IM = p_IMs[i]; - if (inverse) { - IM = IntersectionMatrix::Transpose(IM); - } - if (detail::EvaluatePredicate(predicate, IM)) { - return pair; - } else { - return invalid_pair; - } - }); + thrust::transform( + rmm::exec_policy_nosync(stream), + 
thrust::make_zip_iterator(thrust::make_tuple( + point_ids.begin() + ids_begin, poly_ids.begin() + ids_begin, IMs.begin())), + thrust::make_zip_iterator(thrust::make_tuple( + point_ids.begin() + ids_end, poly_ids.begin() + ids_end, IMs.end())), + thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin() + ids_begin, + poly_ids.begin() + ids_begin)), + [=] __device__(const thrust::tuple& t) { + auto res = thrust::make_tuple(thrust::get<0>(t), thrust::get<1>(t)); + auto IM = thrust::get<2>(t); + + if (inverse) { + IM = IntersectionMatrix::Transpose(IM); + } + + return detail::EvaluatePredicate(predicate, IM) ? res : invalid_tuple; + }); } - auto end = thrust::remove_if( - rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - [=] __device__(const thrust::pair& pair) { - return pair == invalid_pair; - }); - size_t new_size = thrust::distance(ids.data(), end); - GPUSPATIAL_LOG_INFO("Refined, result size %zu", new_size); - ids.set_size(stream, new_size); + auto end = thrust::remove_if(rmm::exec_policy_nosync(stream), zip_begin, zip_end, + [=] __device__(const thrust::tuple& tu) { + return tu == invalid_tuple; + }); + size_t new_size = thrust::distance(zip_begin, end); + point_ids.resize(new_size, stream); + poly_ids.resize(new_size, stream); } template @@ -493,11 +563,12 @@ void RelateEngine::EvaluateImpl( const PointArrayView& point_array, const MultiPointArrayView& multi_point_array, const MultiPolygonArrayView& multi_poly_array, Predicate predicate, - Queue>& ids, bool inverse) { + rmm::device_uvector& point_ids, rmm::device_uvector& multi_poly_ids, + bool inverse) { using params_t = detail::LaunchParamsPointMultiPolygonQuery; - assert(point_array.empty() || multi_point_array.empty()); - size_t ids_size = ids.size(stream); + assert(point_ids.size() == multi_poly_ids.size()); + size_t ids_size = point_ids.size(); GPUSPATIAL_LOG_INFO( "Refine with ray-tracing, (multi-)point %zu, multi-polygon %zu, predicate %s, result size %zu, inverse %d", 
!point_array.empty() ? point_array.size() : multi_point_array.size(), @@ -506,37 +577,44 @@ void RelateEngine::EvaluateImpl( if (ids_size == 0) { return; } - // pair.first is point id; pair.second is multi polygon id - // Sort by multi polygon id - thrust::sort(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - [] __device__(const thrust::pair& pair1, - const thrust::pair& pair2) { - return pair1.second < pair2.second; + auto zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(point_ids.begin(), multi_poly_ids.begin())); + auto zip_end = thrust::make_zip_iterator( + thrust::make_tuple(point_ids.end(), multi_poly_ids.end())); + auto invalid_tuple = thrust::make_tuple(std::numeric_limits::max(), + std::numeric_limits::max()); + + // Sort by polygon id + thrust::sort(rmm::exec_policy_nosync(stream), zip_begin, zip_end, + [] __device__(const thrust::tuple& tu1, + const thrust::tuple& tu2) { + return thrust::get<1>(tu1) < thrust::get<1>(tu2); }); - rmm::device_uvector multi_poly_ids(ids_size, stream); + rmm::device_uvector uniq_multi_poly_ids(ids_size, stream); - thrust::transform(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - multi_poly_ids.data(), - [] __device__(const thrust::pair& pair) { - return pair.second; - }); - auto multi_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream), - multi_poly_ids.begin(), multi_poly_ids.end()); - multi_poly_ids.resize(thrust::distance(multi_poly_ids.begin(), multi_poly_ids_end), - stream); - multi_poly_ids.shrink_to_fit(stream); + thrust::copy(rmm::exec_policy_nosync(stream), multi_poly_ids.begin(), + multi_poly_ids.end(), uniq_multi_poly_ids.begin()); + + // Collect uniq polygon ids to estimate total BVH memory usage + auto uniq_multi_poly_ids_end = + thrust::unique(rmm::exec_policy_nosync(stream), uniq_multi_poly_ids.begin(), + uniq_multi_poly_ids.end()); + uniq_multi_poly_ids.resize( + thrust::distance(uniq_multi_poly_ids.begin(), uniq_multi_poly_ids_end), stream); + 
uniq_multi_poly_ids.shrink_to_fit(stream); auto bvh_bytes = - EstimateBVHSize(stream, multi_poly_array, ArrayView(multi_poly_ids)); - size_t avail_bytes = rmm::available_device_memory().first * config_.memory_quota; + EstimateBVHSize(stream, multi_poly_array, ArrayView(uniq_multi_poly_ids), + config_.segs_per_aabb); + size_t avail_bytes = + MemoryManager::instance().get_available_device_memory() * config_.memory_quota; auto n_batches = bvh_bytes / avail_bytes + 1; auto batch_size = (ids_size + n_batches - 1) / n_batches; - auto invalid_pair = thrust::make_pair(std::numeric_limits::max(), - std::numeric_limits::max()); + GPUSPATIAL_LOG_INFO( "Unique multi-polygons %zu, memory quota %zu MB, estimated BVH size %zu MB", - multi_poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024)); + uniq_multi_poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024)); for (int batch = 0; batch < n_batches; batch++) { auto ids_begin = batch * batch_size; @@ -544,47 +622,48 @@ void RelateEngine::EvaluateImpl( auto ids_size_batch = ids_end - ids_begin; // Extract multi polygon IDs in this batch - multi_poly_ids.resize(ids_size_batch, stream); + uniq_multi_poly_ids.resize(ids_size_batch, stream); - thrust::transform(rmm::exec_policy_nosync(stream), ids.data() + ids_begin, - ids.data() + ids_end, multi_poly_ids.data(), - [] __device__(const thrust::pair& pair) { - return pair.second; - }); + thrust::copy(rmm::exec_policy_nosync(stream), multi_poly_ids.begin() + ids_begin, + multi_poly_ids.begin() + ids_end, uniq_multi_poly_ids.begin()); // multi polygon ids have been sorted before - multi_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream), - multi_poly_ids.begin(), multi_poly_ids.end()); - multi_poly_ids.resize(thrust::distance(multi_poly_ids.begin(), multi_poly_ids_end), - stream); - multi_poly_ids.shrink_to_fit(stream); + uniq_multi_poly_ids_end = + thrust::unique(rmm::exec_policy_nosync(stream), uniq_multi_poly_ids.begin(), + 
uniq_multi_poly_ids.end()); + uniq_multi_poly_ids.resize( + thrust::distance(uniq_multi_poly_ids.begin(), uniq_multi_poly_ids_end), stream); + uniq_multi_poly_ids.shrink_to_fit(stream); rmm::device_uvector IMs(ids_size_batch, stream); - rmm::device_uvector seg_begins(0, stream); - rmm::device_uvector uniq_part_begins(0, stream); rmm::device_buffer bvh_buffer(0, stream); rmm::device_uvector aabb_multi_poly_ids(0, stream), aabb_part_ids(0, stream), aabb_ring_ids(0, stream); + rmm::device_uvector> aabb_vertex_offsets(0, stream); + rmm::device_uvector uniq_part_begins(0, stream); - auto handle = BuildBVH(stream, multi_poly_array, ArrayView(multi_poly_ids), - seg_begins, uniq_part_begins, bvh_buffer, aabb_multi_poly_ids, - aabb_part_ids, aabb_ring_ids); + auto handle = + BuildBVH(stream, multi_poly_array, ArrayView(uniq_multi_poly_ids), + config_.segs_per_aabb, bvh_buffer, aabb_multi_poly_ids, aabb_part_ids, + aabb_ring_ids, aabb_vertex_offsets, uniq_part_begins); params_t params; params.points = point_array; params.multi_points = multi_point_array; params.multi_polygons = multi_poly_array; - params.multi_polygon_ids = ArrayView(multi_poly_ids); - params.ids = ArrayView>(ids.data() + ids_begin, - ids_size_batch); - params.seg_begins = ArrayView(seg_begins); + params.uniq_multi_polygon_ids = ArrayView(uniq_multi_poly_ids); + params.query_point_ids = point_ids.data() + ids_begin; + params.query_multi_polygon_ids = multi_poly_ids.data() + ids_begin; + params.query_size = ids_size_batch; params.uniq_part_begins = ArrayView(uniq_part_begins); params.IMs = ArrayView(IMs); params.handle = handle; params.aabb_multi_poly_ids = ArrayView(aabb_multi_poly_ids); params.aabb_part_ids = ArrayView(aabb_part_ids); params.aabb_ring_ids = ArrayView(aabb_ring_ids); + params.aabb_vertex_offsets = + ArrayView>(aabb_vertex_offsets); rmm::device_buffer params_buffer(sizeof(params_t), stream); @@ -596,166 +675,90 @@ void RelateEngine::EvaluateImpl( dim3{static_cast(ids_size_batch), 1, 1}, 
ArrayView((char*)params_buffer.data(), params_buffer.size())); - auto* p_IMs = IMs.data(); - auto* p_ids = ids.data(); - - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(ids_size_batch), - ids.data() + ids_begin, [=] __device__(uint32_t i) { - const auto& pair = p_ids[ids_begin + i]; - - auto IM = p_IMs[i]; - if (inverse) { - IM = IntersectionMatrix::Transpose(IM); - } - if (detail::EvaluatePredicate(predicate, IM)) { - return pair; - } else { - return invalid_pair; - } - }); + thrust::transform( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin() + ids_begin, + multi_poly_ids.begin() + ids_begin, + IMs.begin())), + thrust::make_zip_iterator(thrust::make_tuple( + point_ids.begin() + ids_end, multi_poly_ids.begin() + ids_end, IMs.end())), + thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin() + ids_begin, + multi_poly_ids.begin() + ids_begin)), + [=] __device__(const thrust::tuple& t) { + auto res = thrust::make_tuple(thrust::get<0>(t), thrust::get<1>(t)); + auto IM = thrust::get<2>(t); + + if (inverse) { + IM = IntersectionMatrix::Transpose(IM); + } + + return detail::EvaluatePredicate(predicate, IM) ? 
res : invalid_tuple; + }); } - auto end = thrust::remove_if( - rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - [=] __device__(const thrust::pair& pair) { - return pair == invalid_pair; - }); - size_t new_size = thrust::distance(ids.data(), end); - GPUSPATIAL_LOG_INFO("Refined, result size %zu", new_size); - ids.set_size(stream, new_size); + auto end = thrust::remove_if(rmm::exec_policy_nosync(stream), zip_begin, zip_end, + [=] __device__(const thrust::tuple& tu) { + return tu == invalid_tuple; + }); + size_t new_size = thrust::distance(zip_begin, end); + point_ids.resize(new_size, stream); + multi_poly_ids.resize(new_size, stream); } template size_t RelateEngine::EstimateBVHSize( const rmm::cuda_stream_view& stream, const PolygonArrayView& polys, - ArrayView poly_ids) { - auto n_polygons = poly_ids.size(); - rmm::device_uvector n_segs(n_polygons, stream); - auto* p_nsegs = n_segs.data(); - - LaunchKernel(stream, [=] __device__() { - using WarpReduce = cub::WarpReduce; - __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; - auto lane = threadIdx.x % 32; - auto warp_id = threadIdx.x / 32; - auto global_warp_id = TID_1D / 32; - auto n_warps = TOTAL_THREADS_1D / 32; - - for (auto i = global_warp_id; i < n_polygons; i += n_warps) { - auto id = poly_ids[i]; - const auto& polygon = polys[id]; - uint32_t total_segs = 0; - - for (auto ring = lane; ring < polygon.num_rings(); ring += 32) { - total_segs += polygon.get_ring(ring).num_points(); - } - total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs); - if (lane == 0) { - p_nsegs[i] = total_segs; - } - } - }); - auto total_segs = - thrust::reduce(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end()); - if (total_segs == 0) { + ArrayView poly_ids, int segs_per_aabb) { + auto num_aabbs = detail::ComputeNumAabbs(stream, polys, poly_ids, segs_per_aabb); + if (num_aabbs == 0) { return 0; } + // temporary but still needed to consider this part of memory - auto aabb_size = 
total_segs * sizeof(OptixAabb); + auto aabb_size = num_aabbs * sizeof(OptixAabb); auto bvh_bytes = rt_engine_->EstimateMemoryUsageForAABB( - total_segs, config_.bvh_fast_build, config_.bvh_fast_compact); - // BVH size and aabb_poly_ids, aabb_ring_ids - return aabb_size + bvh_bytes + 2 * sizeof(INDEX_T) * total_segs; + num_aabbs, config_.bvh_fast_build, config_.bvh_compact); + // BVH size and aabb_poly_ids, aabb_ring_ids, aabb_vertex_offsets + return aabb_size + bvh_bytes + 4 * sizeof(INDEX_T) * num_aabbs; } template size_t RelateEngine::EstimateBVHSize( const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& multi_polys, - ArrayView multi_poly_ids) { - auto n_mult_polygons = multi_poly_ids.size(); - rmm::device_uvector n_segs(n_mult_polygons, stream); - auto* p_nsegs = n_segs.data(); - - LaunchKernel(stream, [=] __device__() { - using WarpReduce = cub::WarpReduce; - __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; - auto lane = threadIdx.x % 32; - auto warp_id = threadIdx.x / 32; - auto global_warp_id = TID_1D / 32; - auto n_warps = TOTAL_THREADS_1D / 32; - - for (auto i = global_warp_id; i < n_mult_polygons; i += n_warps) { - auto id = multi_poly_ids[i]; - const auto& multi_polygon = multi_polys[id]; - uint32_t total_segs = 0; + ArrayView multi_poly_ids, int segs_per_aabb) { + auto num_aabbs = + detail::ComputeNumAabbs(stream, multi_polys, multi_poly_ids, segs_per_aabb); - for (int part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) { - auto polygon = multi_polygon.get_polygon(part_idx); - for (auto ring = lane; ring < polygon.num_rings(); ring += 32) { - total_segs += polygon.get_ring(ring).num_points(); - } - } - total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs); - if (lane == 0) { - p_nsegs[i] = total_segs; - } - } - }); - auto total_segs = - thrust::reduce(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end()); - if (total_segs == 0) { - return 0; - } // temporary but still needed to consider 
this part of memory - auto aabb_size = total_segs * sizeof(OptixAabb); + auto aabb_size = num_aabbs * sizeof(OptixAabb); auto bvh_bytes = rt_engine_->EstimateMemoryUsageForAABB( - total_segs, config_.bvh_fast_build, config_.bvh_fast_compact); - // BVH size and aabb_multi_poly_ids, aabb_part_ids, aabb_ring_ids - return aabb_size + bvh_bytes + 3 * sizeof(INDEX_T) * total_segs; + num_aabbs, config_.bvh_fast_build, config_.bvh_compact); + // BVH size and aabb_multi_poly_ids, aabb_part_ids, aabb_ring_ids, aabb_vertex_offsets + return aabb_size + bvh_bytes + 5 * sizeof(INDEX_T) * num_aabbs; } template OptixTraversableHandle RelateEngine::BuildBVH( const rmm::cuda_stream_view& stream, const PolygonArrayView& polygons, ArrayView polygon_ids, - rmm::device_uvector& seg_begins, rmm::device_buffer& buffer, + int segs_per_aabb, rmm::device_buffer& buffer, rmm::device_uvector& aabb_poly_ids, - rmm::device_uvector& aabb_ring_ids) { + rmm::device_uvector& aabb_ring_ids, + rmm::device_uvector>& aabb_vertex_offsets) { auto n_polygons = polygon_ids.size(); - rmm::device_uvector n_segs(n_polygons, stream); - - // TODO: warp reduce - thrust::transform(rmm::exec_policy_nosync(stream), polygon_ids.begin(), - polygon_ids.end(), n_segs.begin(), - [=] __device__(const uint32_t& id) -> uint32_t { - const auto& polygon = polygons[id]; - uint32_t total_segs = 0; - - for (int ring = 0; ring < polygon.num_rings(); ring++) { - total_segs += polygon.get_ring(ring).num_points(); - } - return total_segs; - }); - - seg_begins = std::move(rmm::device_uvector(n_polygons + 1, stream)); - auto* p_seg_begins = seg_begins.data(); - seg_begins.set_element_to_zero_async(0, stream); - - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end(), - seg_begins.begin() + 1); - - uint32_t num_aabbs = seg_begins.back_element(stream); - + auto num_aabbs = detail::ComputeNumAabbs(stream, polygons, polygon_ids, segs_per_aabb); aabb_poly_ids = std::move(rmm::device_uvector(num_aabbs, 
stream)); aabb_ring_ids = std::move(rmm::device_uvector(num_aabbs, stream)); + aabb_vertex_offsets = + std::move(rmm::device_uvector>(num_aabbs, stream)); - auto* p_poly_ids = aabb_poly_ids.data(); - auto* p_ring_ids = aabb_ring_ids.data(); + auto* p_aabb_poly_ids = aabb_poly_ids.data(); + auto* p_aabb_ring_ids = aabb_ring_ids.data(); + auto* p_aabb_vertex_offsets = aabb_vertex_offsets.data(); - rmm::device_uvector aabbs(num_aabbs, stream); - auto* p_aabbs = aabbs.data(); + rmm::device_scalar d_tail(0, stream); + + auto* p_tail = d_tail.data(); LaunchKernel(stream.value(), [=] __device__() { auto lane = threadIdx.x % 32; @@ -763,191 +766,222 @@ OptixTraversableHandle RelateEngine::BuildBVH( auto n_warps = TOTAL_THREADS_1D / 32; // each warp takes a polygon - // i is the renumbered polygon id starting from 0 for (auto i = global_warp_id; i < n_polygons; i += n_warps) { auto poly_id = polygon_ids[i]; const auto& polygon = polygons[poly_id]; - auto tail = p_seg_begins[i]; // entire warp sequentially visit each ring for (uint32_t ring_idx = 0; ring_idx < polygon.num_rings(); ring_idx++) { auto ring = polygon.get_ring(ring_idx); - // this is like a hash function, its okay to overflow - OptixAabb aabb; - aabb.minZ = aabb.maxZ = i; - - // each lane takes a seg - for (auto seg_idx = lane; seg_idx < ring.num_segments(); seg_idx += 32) { - const auto& seg = ring.get_line_segment(seg_idx); - const auto& p1 = seg.get_p1(); - const auto& p2 = seg.get_p2(); - - aabb.minX = std::min(p1.x(), p2.x()); - aabb.maxX = std::max(p1.x(), p2.x()); - aabb.minY = std::min(p1.y(), p2.y()); - aabb.maxY = std::max(p1.y(), p2.y()); - - if (std::is_same_v) { - aabb.minX = next_float_from_double(aabb.minX, -1, 2); - aabb.maxX = next_float_from_double(aabb.maxX, 1, 2); - aabb.minY = next_float_from_double(aabb.minY, -1, 2); - aabb.maxY = next_float_from_double(aabb.maxY, 1, 2); - } - p_aabbs[tail + seg_idx] = aabb; - p_poly_ids[tail + seg_idx] = poly_id; - p_ring_ids[tail + seg_idx] = ring_idx; + 
auto aabbs_per_ring = (ring.num_segments() + segs_per_aabb - 1) / segs_per_aabb; + // e.g., num segs = 3, segs_per_aabb = 2 + // The first aabb covers seg 0,1, with vertex id (0,1,2) + // The second aabb covers seg 2, with vertex id (2,3) + // each lane takes an aabb + for (auto aabb_idx = lane; aabb_idx < aabbs_per_ring; aabb_idx += 32) { + INDEX_T local_vertex_begin = aabb_idx * segs_per_aabb; + INDEX_T local_vertex_end = + std::min((INDEX_T)(local_vertex_begin + segs_per_aabb), + (INDEX_T)ring.num_segments()); + + auto tail = atomicAdd(p_tail, 1); + + assert(tail < num_aabbs); + p_aabb_poly_ids[tail] = poly_id; + p_aabb_ring_ids[tail] = ring_idx; + p_aabb_vertex_offsets[tail] = + thrust::make_pair(local_vertex_begin, local_vertex_end); } - tail += ring.num_segments(); - // fill a dummy AABB, so we have aabb-vertex one-to-one relationship - if (lane == 0) { - p_aabbs[tail] = OptixAabb{0, 0, 0, 0, 0, 0}; - } - tail++; } - assert(p_seg_begins[i + 1] == tail); } }); + rmm::device_uvector aabbs(num_aabbs, stream); + + // Fill AABBs + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_aabbs), aabbs.begin(), + [=] __device__(const uint32_t& aabb_idx) { + OptixAabb aabb; + aabb.minX = std::numeric_limits::max(); + aabb.minY = std::numeric_limits::max(); + aabb.maxX = std::numeric_limits::lowest(); + aabb.maxY = std::numeric_limits::lowest(); + + auto poly_id = p_aabb_poly_ids[aabb_idx]; + auto ring_id = p_aabb_ring_ids[aabb_idx]; + auto vertex_offset_pair = p_aabb_vertex_offsets[aabb_idx]; + const auto& polygon = polygons[poly_id]; + const auto& ring = polygon.get_ring(ring_id); + + for (auto vidx = vertex_offset_pair.first; + vidx <= vertex_offset_pair.second; vidx++) { + const auto& v = ring.get_point(vidx); + float x = v.x(); + float y = v.y(); + + aabb.minX = fminf(aabb.minX, x); + aabb.maxX = fmaxf(aabb.maxX, x); + aabb.minY = fminf(aabb.minY, y); + aabb.maxY = fmaxf(aabb.maxY, y); + } + + if 
(std::is_same_v) { + aabb.minX = next_float_from_double(aabb.minX, -1, 2); + aabb.maxX = next_float_from_double(aabb.maxX, 1, 2); + aabb.minY = next_float_from_double(aabb.minY, -1, 2); + aabb.maxY = next_float_from_double(aabb.maxY, 1, 2); + } + // Using minZ/maxZ to store polygon id for better filtering + // Refer to polygon_point_query.cu + aabb.minZ = aabb.maxZ = poly_id; + return aabb; + }); + assert(rt_engine_ != nullptr); return rt_engine_->BuildAccelCustom(stream.value(), ArrayView(aabbs), buffer, - config_.bvh_fast_build, config_.bvh_fast_compact); + config_.bvh_fast_build, config_.bvh_compact); } template OptixTraversableHandle RelateEngine::BuildBVH( const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& multi_polys, - ArrayView multi_poly_ids, rmm::device_uvector& seg_begins, - rmm::device_uvector& part_begins, rmm::device_buffer& buffer, + ArrayView multi_poly_ids, int segs_per_aabb, rmm::device_buffer& buffer, rmm::device_uvector& aabb_multi_poly_ids, rmm::device_uvector& aabb_part_ids, - rmm::device_uvector& aabb_ring_ids) { + rmm::device_uvector& aabb_ring_ids, + rmm::device_uvector>& aabb_vertex_offsets, + rmm::device_uvector& part_begins) { auto n_mult_polygons = multi_poly_ids.size(); - rmm::device_uvector n_segs(n_mult_polygons, stream); - auto* p_nsegs = n_segs.data(); - - LaunchKernel(stream, [=] __device__() { - using WarpReduce = cub::WarpReduce; - __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; - auto lane = threadIdx.x % 32; - auto warp_id = threadIdx.x / 32; - auto global_warp_id = TID_1D / 32; - auto n_warps = TOTAL_THREADS_1D / 32; - - for (auto i = global_warp_id; i < n_mult_polygons; i += n_warps) { - auto id = multi_poly_ids[i]; - const auto& multi_polygon = multi_polys[id]; - uint32_t total_segs = 0; - - for (int part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) { - auto polygon = multi_polygon.get_polygon(part_idx); - for (auto ring = lane; ring < polygon.num_rings(); ring += 32) { 
- total_segs += polygon.get_ring(ring).num_points(); - } - } - total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs); - if (lane == 0) { - p_nsegs[i] = total_segs; - } - } - }); - - seg_begins = std::move(rmm::device_uvector(n_mult_polygons + 1, stream)); - auto* p_seg_begins = seg_begins.data(); - seg_begins.set_element_to_zero_async(0, stream); - - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end(), - seg_begins.begin() + 1); - // each line seg is corresponding to an AABB and each ring includes an empty AABB - uint32_t num_aabbs = seg_begins.back_element(stream); + auto num_aabbs = + detail::ComputeNumAabbs(stream, multi_polys, multi_poly_ids, segs_per_aabb); + if (num_aabbs == 0) { + return 0; + } aabb_multi_poly_ids = std::move(rmm::device_uvector(num_aabbs, stream)); aabb_part_ids = std::move(rmm::device_uvector(num_aabbs, stream)); aabb_ring_ids = std::move(rmm::device_uvector(num_aabbs, stream)); + aabb_vertex_offsets = + std::move(rmm::device_uvector>(num_aabbs, stream)); + rmm::device_uvector aabb_seq_ids(num_aabbs, stream); - auto* p_multi_poly_ids = aabb_multi_poly_ids.data(); - auto* p_part_ids = aabb_part_ids.data(); - auto* p_ring_ids = aabb_ring_ids.data(); - - rmm::device_uvector aabbs(num_aabbs, stream); - auto* p_aabbs = aabbs.data(); - - rmm::device_uvector num_parts(n_mult_polygons, stream); + auto* p_aabb_multi_poly_ids = aabb_multi_poly_ids.data(); + auto* p_aabb_part_ids = aabb_part_ids.data(); + auto* p_aabb_ring_ids = aabb_ring_ids.data(); + auto* p_aabb_vertex_offsets = aabb_vertex_offsets.data(); + auto* p_aabb_seq_ids = aabb_seq_ids.data(); - thrust::transform(rmm::exec_policy_nosync(stream), multi_poly_ids.begin(), - multi_poly_ids.end(), num_parts.begin(), [=] __device__(uint32_t id) { - const auto& multi_polygon = multi_polys[id]; - return multi_polygon.num_polygons(); - }); + rmm::device_scalar d_tail(0, stream); - part_begins = std::move(rmm::device_uvector(n_mult_polygons + 1, stream)); - 
auto* p_part_begins = part_begins.data(); - part_begins.set_element_to_zero_async(0, stream); - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), num_parts.begin(), - num_parts.end(), part_begins.begin() + 1); - num_parts.resize(0, stream); - num_parts.shrink_to_fit(stream); + auto* p_tail = d_tail.data(); LaunchKernel(stream.value(), [=] __device__() { auto lane = threadIdx.x % 32; auto global_warp_id = TID_1D / 32; auto n_warps = TOTAL_THREADS_1D / 32; - // each warp takes a multi polygon - // i is the renumbered polygon id starting from 0 + // each warp takes a polygon for (auto i = global_warp_id; i < n_mult_polygons; i += n_warps) { auto multi_poly_id = multi_poly_ids[i]; const auto& multi_polygon = multi_polys[multi_poly_id]; - auto tail = p_seg_begins[i]; - // entire warp sequentially visit each part for (uint32_t part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) { auto polygon = multi_polygon.get_polygon(part_idx); - // entire warp sequentially visit each ring for (uint32_t ring_idx = 0; ring_idx < polygon.num_rings(); ring_idx++) { auto ring = polygon.get_ring(ring_idx); - // this is like a hash function, its okay to overflow - OptixAabb aabb; - aabb.minZ = aabb.maxZ = p_part_begins[i] + part_idx; - - // each lane takes a seg - for (auto seg_idx = lane; seg_idx < ring.num_segments(); seg_idx += 32) { - const auto& seg = ring.get_line_segment(seg_idx); - const auto& p1 = seg.get_p1(); - const auto& p2 = seg.get_p2(); - - aabb.minX = std::min(p1.x(), p2.x()); - aabb.maxX = std::max(p1.x(), p2.x()); - aabb.minY = std::min(p1.y(), p2.y()); - aabb.maxY = std::max(p1.y(), p2.y()); - - if (std::is_same_v) { - aabb.minX = next_float_from_double(aabb.minX, -1, 2); - aabb.maxX = next_float_from_double(aabb.maxX, 1, 2); - aabb.minY = next_float_from_double(aabb.minY, -1, 2); - aabb.maxY = next_float_from_double(aabb.maxY, 1, 2); - } - p_aabbs[tail + seg_idx] = aabb; - p_multi_poly_ids[tail + seg_idx] = multi_poly_id; - p_part_ids[tail + 
seg_idx] = part_idx; - p_ring_ids[tail + seg_idx] = ring_idx; - } - tail += ring.num_segments(); - // fill a dummy AABB, so we have aabb-vertex one-to-one relationship - if (lane == 0) { - p_aabbs[tail] = OptixAabb{0, 0, 0, 0, 0, 0}; + auto aabbs_per_ring = (ring.num_segments() + segs_per_aabb - 1) / segs_per_aabb; + // e.g., num segs = 3, segs_per_aabb = 2 + // The first aabb covers seg 0,1, with vertex id (0,1,2) + // The second aabb covers seg 2, with vertex id (2,3) + // each lane takes an aabb + for (auto aabb_idx = lane; aabb_idx < aabbs_per_ring; aabb_idx += 32) { + INDEX_T local_vertex_begin = aabb_idx * segs_per_aabb; + INDEX_T local_vertex_end = + std::min((INDEX_T)(local_vertex_begin + segs_per_aabb), + (INDEX_T)ring.num_segments()); + + auto tail = atomicAdd(p_tail, 1); + + assert(tail < num_aabbs); + p_aabb_multi_poly_ids[tail] = multi_poly_id; + p_aabb_part_ids[tail] = part_idx; + p_aabb_ring_ids[tail] = ring_idx; + p_aabb_vertex_offsets[tail] = + thrust::make_pair(local_vertex_begin, local_vertex_end); + p_aabb_seq_ids[tail] = i; } - tail++; } } - assert(p_seg_begins[i + 1] == tail); } }); + rmm::device_uvector aabbs(num_aabbs, stream); + part_begins = std::move(rmm::device_uvector(n_mult_polygons + 1, stream)); + auto* p_part_begins = part_begins.data(); + part_begins.set_element_to_zero_async(0, stream); + rmm::device_uvector num_parts(n_mult_polygons, stream); + + thrust::transform(rmm::exec_policy_nosync(stream), multi_poly_ids.begin(), + multi_poly_ids.end(), num_parts.begin(), [=] __device__(uint32_t id) { + const auto& multi_polygon = multi_polys[id]; + return multi_polygon.num_polygons(); + }); + + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), num_parts.begin(), + num_parts.end(), part_begins.begin() + 1); + num_parts.resize(0, stream); + num_parts.shrink_to_fit(stream); + + // Fill AABBs + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_aabbs), 
aabbs.begin(), + [=] __device__(const uint32_t& aabb_idx) { + OptixAabb aabb; + aabb.minX = std::numeric_limits::max(); + aabb.minY = std::numeric_limits::max(); + aabb.maxX = std::numeric_limits::lowest(); + aabb.maxY = std::numeric_limits::lowest(); + + auto multi_poly_id = p_aabb_multi_poly_ids[aabb_idx]; + auto part_id = p_aabb_part_ids[aabb_idx]; + auto ring_id = p_aabb_ring_ids[aabb_idx]; + auto vertex_offset_pair = p_aabb_vertex_offsets[aabb_idx]; + auto seq_id = p_aabb_seq_ids[aabb_idx]; + auto multi_polygon = multi_polys[multi_poly_id]; + const auto& polygon = multi_polygon.get_polygon(part_id); + const auto& ring = polygon.get_ring(ring_id); + + for (auto vidx = vertex_offset_pair.first; + vidx <= vertex_offset_pair.second; vidx++) { + const auto& v = ring.get_point(vidx); + float x = v.x(); + float y = v.y(); + + aabb.minX = fminf(aabb.minX, x); + aabb.maxX = fmaxf(aabb.maxX, x); + aabb.minY = fminf(aabb.minY, y); + aabb.maxY = fmaxf(aabb.maxY, y); + } + + if (std::is_same_v) { + aabb.minX = next_float_from_double(aabb.minX, -1, 2); + aabb.maxX = next_float_from_double(aabb.maxX, 1, 2); + aabb.minY = next_float_from_double(aabb.minY, -1, 2); + aabb.maxY = next_float_from_double(aabb.maxY, 1, 2); + } + + aabb.minZ = aabb.maxZ = p_part_begins[seq_id] + part_id; + return aabb; + }); assert(rt_engine_ != nullptr); + return rt_engine_->BuildAccelCustom(stream.value(), ArrayView(aabbs), buffer, - config_.bvh_fast_build, config_.bvh_fast_compact); + config_.bvh_fast_build, config_.bvh_compact); } // Explicitly instantiate the template for specific types template class RelateEngine, uint32_t>; diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp b/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp index 7596e0cb3..9857be56c 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp @@ -14,12 +14,12 @@ // KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. -#include "gpuspatial/index/detail/rt_engine.hpp" -#include "gpuspatial/utils/cuda_utils.h" -#include "gpuspatial/utils/exception.h" +#include "gpuspatial/rt/rt_engine.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" +#include "gpuspatial/utils/exception.hpp" #include "gpuspatial/utils/logger.hpp" -#include "rt/shaders/shader_config.h" +#include "rt/shaders/shader_config.hpp" #include "rmm/device_scalar.hpp" @@ -57,8 +57,6 @@ void context_log_cb(unsigned int level, const char* tag, const char* message, vo } // namespace namespace gpuspatial { -namespace details { - // --- RTConfig Method Definitions --- void RTConfig::AddModule(const Module& mod) { @@ -103,6 +101,12 @@ RTConfig get_default_rt_config(const std::string& ptx_root) { RTEngine::RTEngine() : initialized_(false) {} RTEngine::~RTEngine() { + cudaError_t probe = cudaPeekAtLastError(); + + if (probe == cudaErrorCudartUnloading) { + GPUSPATIAL_LOG_ERROR("CUDA runtime is unloaded"); + return; + } if (initialized_) { releaseOptixResources(); } @@ -112,6 +116,7 @@ void RTEngine::Init(const RTConfig& config) { if (initialized_) { releaseOptixResources(); } + GPUSPATIAL_LOG_INFO("Initialize RTEngine"); initOptix(config); createContext(); createModule(config); @@ -163,32 +168,34 @@ OptixTraversableHandle RTEngine::BuildAccelCustom(cudaStream_t cuda_stream, OPTIX_CHECK(optixAccelComputeMemoryUsage(optix_context_, &accelOptions, &build_input, 1, &blas_buffer_sizes)); - GPUSPATIAL_LOG_INFO( + GPUSPATIAL_LOG_DEBUG( "ComputeBVHMemoryUsage, AABB count: %u, temp size: %zu MB, output size: %zu MB", num_prims, blas_buffer_sizes.tempSizeInBytes / 1024 / 1024, blas_buffer_sizes.outputSizeInBytes / 1024 / 1024); rmm::device_buffer temp_buf(blas_buffer_sizes.tempSizeInBytes, cuda_stream); - out_buf.resize(blas_buffer_sizes.outputSizeInBytes, cuda_stream); if (compact) { + rmm::device_buffer 
uncompacted_buf(blas_buffer_sizes.outputSizeInBytes, cuda_stream); rmm::device_scalar compacted_size(cuda_stream); OptixAccelEmitDesc emitDesc; emitDesc.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; emitDesc.result = reinterpret_cast(compacted_size.data()); - OPTIX_CHECK(optixAccelBuild( - optix_context_, cuda_stream, &accelOptions, &build_input, 1, - reinterpret_cast(temp_buf.data()), blas_buffer_sizes.tempSizeInBytes, - reinterpret_cast(out_buf.data()), - blas_buffer_sizes.outputSizeInBytes, &traversable, &emitDesc, 1)); + OPTIX_CHECK(optixAccelBuild(optix_context_, cuda_stream, &accelOptions, &build_input, + 1, reinterpret_cast(temp_buf.data()), + blas_buffer_sizes.tempSizeInBytes, + reinterpret_cast(uncompacted_buf.data()), + uncompacted_buf.size(), &traversable, &emitDesc, 1)); auto size = compacted_size.value(cuda_stream); out_buf.resize(size, cuda_stream); OPTIX_CHECK(optixAccelCompact(optix_context_, cuda_stream, traversable, - reinterpret_cast(out_buf.data()), size, - &traversable)); + reinterpret_cast(out_buf.data()), + out_buf.size(), &traversable)); } else { + out_buf.resize(blas_buffer_sizes.outputSizeInBytes, cuda_stream); + OPTIX_CHECK(optixAccelBuild( optix_context_, cuda_stream, &accelOptions, &build_input, 1, reinterpret_cast(temp_buf.data()), blas_buffer_sizes.tempSizeInBytes, @@ -488,15 +495,14 @@ std::vector RTEngine::readData(const std::string& filename) { } void RTEngine::releaseOptixResources() { + GPUSPATIAL_LOG_INFO("Release OptiX resources"); for (auto& [id, res] : resources_) { - optixPipelineDestroy(res.pipeline); - optixProgramGroupDestroy(res.raygen_pg); - optixProgramGroupDestroy(res.miss_pg); - optixProgramGroupDestroy(res.hitgroup_pg); - optixModuleDestroy(res.module); + OPTIX_CHECK(optixPipelineDestroy(res.pipeline)); + OPTIX_CHECK(optixProgramGroupDestroy(res.raygen_pg)); + OPTIX_CHECK(optixProgramGroupDestroy(res.miss_pg)); + OPTIX_CHECK(optixProgramGroupDestroy(res.hitgroup_pg)); + OPTIX_CHECK(optixModuleDestroy(res.module)); } - 
optixDeviceContextDestroy(optix_context_); + OPTIX_CHECK(optixDeviceContextDestroy(optix_context_)); } - -} // namespace details } // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu index 3ffdca9ea..f9a632dd3 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu @@ -14,10 +14,9 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -#include "gpuspatial/index/detail/launch_parameters.h" -#include "gpuspatial/relate/relate.cuh" -#include "ray_params.h" -#include "shader_config.h" +#include "gpuspatial/rt/launch_parameters.cuh" +#include "ray_params.cuh" +#include "shader_config.hpp" #include #include @@ -32,17 +31,22 @@ extern "C" __global__ void __intersection__gpuspatial() { using point_t = gpuspatial::ShaderPointType; constexpr int n_dim = point_t::n_dim; using ray_params_t = gpuspatial::detail::RayParams; - auto geom1_id = optixGetPayload_0(); - auto geom2_id = optixGetPrimitiveIndex(); - const auto& mbr1 = params.mbrs1[geom1_id]; - const auto& mbr2 = params.mbrs2[geom2_id]; - const auto& aabb1 = mbr1.ToOptixAabb(); - const auto aabb2 = mbr2.ToOptixAabb(); + auto rect1_id = optixGetPayload_0(); + auto rect2_id = optixGetPrimitiveIndex(); + const auto& rect1 = params.rects1[rect1_id]; + const auto& rect2 = params.rects2[rect2_id]; + const auto& aabb1 = rect1.ToOptixAabb(); + const auto aabb2 = rect2.ToOptixAabb(); ray_params_t ray_params(aabb1, false); if (ray_params.IsHit(aabb2)) { - if (mbr1.intersects(mbr2)) { - params.ids.Append(thrust::make_pair(geom1_id, geom2_id)); + if (rect1.intersects(rect2)) { + if (params.count == nullptr) { + auto tail = params.rect1_ids.Append(rect1_id); + params.rect2_ids[tail] = rect2_id; + } else { 
+ atomicAdd(params.count, 1); + } } } } @@ -53,20 +57,18 @@ extern "C" __global__ void __raygen__gpuspatial() { using point_t = gpuspatial::ShaderPointType; constexpr int n_dim = point_t::n_dim; - for (uint32_t i = optixGetLaunchIndex().x; i < params.mbrs1.size(); + for (uint32_t i = optixGetLaunchIndex().x; i < params.rects1.size(); i += optixGetLaunchDimensions().x) { - const auto& mbr1 = params.mbrs1[i]; - auto aabb1 = mbr1.ToOptixAabb(); + const auto& rect1 = params.rects1[i]; + if (!rect1.valid()) continue; + auto aabb1 = rect1.ToOptixAabb(); gpuspatial::detail::RayParams ray_params(aabb1, false); - float3 origin, dir; + float3 origin{0, 0, 0}, dir{0, 0, 0}; - origin.x = ray_params.o.x; - origin.y = ray_params.o.y; - origin.z = 0; - - dir.x = ray_params.d.x; - dir.y = ray_params.d.y; - dir.z = 0; + for (int dim = 0; dim < n_dim; dim++) { + (&origin.x)[dim] = (&ray_params.o.x)[dim]; + (&dir.x)[dim] = (&ray_params.d.x)[dim]; + } float tmin = 0; float tmax = 1; diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu index d85d63741..607d95649 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu @@ -14,9 +14,9 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
-#include "gpuspatial/index/detail/launch_parameters.h" -#include "ray_params.h" -#include "shader_config.h" +#include "gpuspatial/rt/launch_parameters.cuh" +#include "ray_params.cuh" +#include "shader_config.hpp" #include #include @@ -31,20 +31,25 @@ extern "C" __global__ void __intersection__gpuspatial() { using point_t = gpuspatial::ShaderPointType; constexpr int n_dim = point_t::n_dim; using ray_params_t = gpuspatial::detail::RayParams; - auto geom1_id = optixGetPrimitiveIndex(); - uint64_t geom2_id = optixGetPayload_0(); - const auto& mbr1 = params.mbrs1[geom1_id]; - const auto& mbr2 = params.mbrs2[geom2_id]; - const auto& aabb1 = mbr1.ToOptixAabb(); - const auto aabb2 = mbr2.ToOptixAabb(); + auto rect1_id = optixGetPrimitiveIndex(); + uint64_t rect2_id = optixGetPayload_0(); + const auto& rect1 = params.rects1[rect1_id]; + const auto& rect2 = params.rects2[rect2_id]; + const auto& aabb1 = rect1.ToOptixAabb(); + const auto aabb2 = rect2.ToOptixAabb(); ray_params_t ray_params(aabb2, true); if (ray_params.IsHit(aabb1)) { // ray cast from AABB2 hits AABB1 ray_params = ray_params_t(aabb1, false); if (!ray_params.IsHit(aabb2)) { // ray cast from AABB1 does not hit AABB2 - if (mbr1.intersects(mbr2)) { - params.ids.Append(thrust::make_pair(geom1_id, geom2_id)); + if (rect1.intersects(rect2)) { + if (params.count == nullptr) { + auto tail = params.rect1_ids.Append(rect1_id); + params.rect2_ids[tail] = rect2_id; + } else { + atomicAdd(params.count, 1); + } } } } @@ -56,20 +61,20 @@ extern "C" __global__ void __raygen__gpuspatial() { using point_t = gpuspatial::ShaderPointType; constexpr int n_dim = point_t::n_dim; - for (uint32_t i = optixGetLaunchIndex().x; i < params.mbrs2.size(); + for (uint32_t i = optixGetLaunchIndex().x; i < params.rects2.size(); i += optixGetLaunchDimensions().x) { - const auto& mbr2 = params.mbrs2[i]; - auto aabb2 = mbr2.ToOptixAabb(); - gpuspatial::detail::RayParams ray_params(aabb2, true); - float3 origin, dir; + const auto& rect2 = 
params.rects2[i]; + + if (!rect2.valid()) continue; - origin.x = ray_params.o.x; - origin.y = ray_params.o.y; - origin.z = 0; + auto aabb2 = rect2.ToOptixAabb(); + gpuspatial::detail::RayParams ray_params(aabb2, true); + float3 origin{0, 0, 0}, dir{0, 0, 0}; - dir.x = ray_params.d.x; - dir.y = ray_params.d.y; - dir.z = 0; + for (int dim = 0; dim < n_dim; dim++) { + (&origin.x)[dim] = (&ray_params.o.x)[dim]; + (&dir.x)[dim] = (&ray_params.d.x)[dim]; + } float tmin = 0; float tmax = 1; diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake index 56daf449a..13aac4e03 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake @@ -20,7 +20,7 @@ function(CONFIG_SHADERS SHADER_PTX_FILES) set(SHADER_POINT_TYPES "SHADER_POINT_FLOAT_2D;SHADER_POINT_DOUBLE_2D") set(SHADERS_DEPS "${PROJECT_SOURCE_DIR}/include/gpuspatial/geom" - "${PROJECT_SOURCE_DIR}/include/gpuspatial/index/detail") + "${PROJECT_SOURCE_DIR}/include/gpuspatial/rt") set(OUTPUT_DIR "${PROJECT_BINARY_DIR}/shaders_ptx") set(OPTIX_MODULE_EXTENSION ".ptx") diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu index f96226c69..3a5c216ba 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu @@ -14,12 +14,12 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
-#include "gpuspatial/geom/line_segment.cuh" -#include "gpuspatial/geom/ray_crossing_counter.cuh" -#include "gpuspatial/index/detail/launch_parameters.h" -#include "gpuspatial/relate/relate.cuh" -#include "gpuspatial/utils/floating_point.h" -#include "shader_config.h" +#include "gpuspatial/geom/ray_crossing_counter.hpp" +#include "gpuspatial/relate/relate.hpp" +#include "gpuspatial/rt/launch_parameters.cuh" +#include "gpuspatial/utils/floating_point.hpp" +#include "gpuspatial/utils/helpers.cuh" +#include "shader_config.hpp" #include #include @@ -44,35 +44,36 @@ extern "C" __global__ void __intersection__gpuspatial() { auto point_part_id = optixGetPayload_7(); const auto& multi_polygons = params.multi_polygons; - auto point_idx = params.ids[query_idx].first; - auto multi_polygon_idx = params.ids[query_idx].second; + auto point_idx = params.query_point_ids[query_idx]; + auto multi_polygon_idx = params.query_multi_polygon_ids[query_idx]; auto hit_multipolygon_idx = params.aabb_multi_poly_ids[aabb_id]; auto hit_part_idx = params.aabb_part_ids[aabb_id]; auto hit_ring_idx = params.aabb_ring_ids[aabb_id]; - + const auto& vertex_offsets = params.aabb_vertex_offsets[aabb_id]; // the seg being hit is not from the query polygon if (hit_multipolygon_idx != multi_polygon_idx || hit_part_idx != part_idx || hit_ring_idx != ring_idx) { return; } - uint32_t local_v1_idx = aabb_id - params.seg_begins[reordered_multi_polygon_idx]; - uint32_t global_v1_idx = v_offset + local_v1_idx; - uint32_t global_v2_idx = global_v1_idx + 1; - - auto vertices = multi_polygons.get_vertices(); - // segment being hit - const auto& v1 = vertices[global_v1_idx]; - const auto& v2 = vertices[global_v2_idx]; - + const auto& multi_polygon = multi_polygons[multi_polygon_idx]; + const auto& polygon = multi_polygon.get_polygon(part_idx); + const auto& ring = polygon.get_ring(ring_idx); RayCrossingCounter locator(crossing_count, point_on_seg); - if (!params.points.empty()) { - const auto& p = 
params.points[point_idx]; - locator.countSegment(p, v1, v2); - } else if (!params.multi_points.empty()) { - const auto& p = params.multi_points[point_idx].get_point(point_part_id); - locator.countSegment(p, v1, v2); + // For each segment in the AABB, count crossings + for (auto vertex_offset = vertex_offsets.first; vertex_offset < vertex_offsets.second; + ++vertex_offset) { + const auto& v1 = ring.get_point(vertex_offset); + const auto& v2 = ring.get_point(vertex_offset + 1); + + if (!params.points.empty()) { + const auto& p = params.points[point_idx]; + locator.countSegment(p, v1, v2); + } else if (!params.multi_points.empty()) { + const auto& p = params.multi_points[point_idx].get_point(point_part_id); + locator.countSegment(p, v1, v2); + } } optixSetPayload_5(locator.get_crossing_count()); @@ -82,22 +83,23 @@ extern "C" __global__ void __intersection__gpuspatial() { extern "C" __global__ void __raygen__gpuspatial() { using namespace gpuspatial; using point_t = gpuspatial::ShaderPointType; - const auto& ids = params.ids; const auto& multi_polygons = params.multi_polygons; - for (uint32_t i = optixGetLaunchIndex().x; i < ids.size(); + for (uint32_t i = optixGetLaunchIndex().x; i < params.query_size; i += optixGetLaunchDimensions().x) { - auto point_idx = ids[i].first; - auto multi_polygon_idx = ids[i].second; + auto point_idx = params.query_point_ids[i]; + auto multi_polygon_idx = params.query_multi_polygon_ids[i]; - auto it = thrust::lower_bound(thrust::seq, params.multi_polygon_ids.begin(), - params.multi_polygon_ids.end(), multi_polygon_idx); - assert(it != params.multi_polygon_ids.end()); + auto it = thrust::lower_bound(thrust::seq, params.uniq_multi_polygon_ids.begin(), + params.uniq_multi_polygon_ids.end(), multi_polygon_idx); + assert(it != params.uniq_multi_polygon_ids.end()); uint32_t reordered_multi_polygon_idx = - thrust::distance(params.multi_polygon_ids.begin(), it); - assert(params.multi_polygon_ids[reordered_multi_polygon_idx] == multi_polygon_idx); 
+ thrust::distance(params.uniq_multi_polygon_ids.begin(), it); + assert(params.uniq_multi_polygon_ids[reordered_multi_polygon_idx] == + multi_polygon_idx); auto handle_point = [&](const point_t& p, uint32_t point_part_id, int& IM) { + assert(!p.empty()); float3 origin; // each polygon takes a z-plane origin.x = p.x(); @@ -108,7 +110,8 @@ extern "C" __global__ void __raygen__gpuspatial() { const auto& mbr = multi_polygon.get_mbr(); auto width = mbr.get_max().x() - mbr.get_min().x(); float tmin = 0; - float tmax = width; + // ensure the floating number is greater than the double + float tmax = next_float_from_double(width, 1, 2); // first polygon offset uint32_t part_offset = multi_polygons.get_prefix_sum_geoms()[multi_polygon_idx]; diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu index 93f5ceb05..c728b4aa3 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu @@ -14,8 +14,8 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
-#include "gpuspatial/index/detail/launch_parameters.h" -#include "shader_config.h" +#include "gpuspatial/rt/launch_parameters.cuh" +#include "shader_config.hpp" #include #include @@ -29,51 +29,38 @@ extern "C" __constant__ extern "C" __global__ void __intersection__gpuspatial() { auto aabb_id = optixGetPrimitiveIndex(); - auto geom2_id = optixGetPayload_0(); - const auto& point = params.points2[geom2_id]; - const auto& mbrs1 = params.mbrs1; + auto point_id = optixGetPayload_0(); + const auto& point = params.points[point_id]; + const auto& rect = params.rects[aabb_id]; - if (params.grouped) { - assert(!params.prefix_sum.empty()); - auto begin = params.prefix_sum[aabb_id]; - auto end = params.prefix_sum[aabb_id + 1]; - - for (auto offset = begin; offset < end; offset++) { - auto geom1_id = params.reordered_indices[offset]; - if (mbrs1.empty()) { - params.ids.Append(thrust::make_pair(geom1_id, geom2_id)); - } else { - const auto& mbr1 = mbrs1[geom1_id]; - - if (mbr1.covers(point.as_float())) { - params.ids.Append(thrust::make_pair(geom1_id, geom2_id)); - } - } - } - } else { - assert(!mbrs1.empty()); - auto geom1_id = aabb_id; - const auto& mbr1 = mbrs1[geom1_id]; - - if (mbr1.covers(point.as_float())) { - params.ids.Append(thrust::make_pair(geom1_id, geom2_id)); + if (rect.covers(point)) { + if (params.count == nullptr) { + auto tail = params.rect_ids.Append(aabb_id); + params.point_ids[tail] = point_id; + } else { + atomicAdd(params.count, 1); } } } extern "C" __global__ void __raygen__gpuspatial() { + using point_t = gpuspatial::ShaderPointType; + constexpr int n_dim = point_t::n_dim; float tmin = 0; float tmax = FLT_MIN; - for (uint32_t i = optixGetLaunchIndex().x; i < params.points2.size(); + for (uint32_t i = optixGetLaunchIndex().x; i < params.points.size(); i += optixGetLaunchDimensions().x) { - const auto& p = params.points2[i]; + const auto& p = params.points[i]; + if (p.empty()) { + continue; + } - float3 origin; + float3 origin{0, 0, 0}; - origin.x = 
p.get_coordinate(0); - origin.y = p.get_coordinate(1); - origin.z = 0; + for (int dim = 0; dim < n_dim; dim++) { + (&origin.x)[dim] = p.get_coordinate(dim); + } float3 dir = {0, 0, 1}; optixTrace(params.handle, origin, dir, tmin, tmax, 0, OptixVisibilityMask(255), diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu index 97cb948d1..05066d793 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu @@ -14,11 +14,11 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -#include "gpuspatial/geom/line_segment.cuh" -#include "gpuspatial/geom/ray_crossing_counter.cuh" -#include "gpuspatial/index/detail/launch_parameters.h" -#include "gpuspatial/relate/relate.cuh" -#include "shader_config.h" +#include "gpuspatial/geom/ray_crossing_counter.hpp" +#include "gpuspatial/relate/relate.hpp" +#include "gpuspatial/rt/launch_parameters.cuh" +#include "gpuspatial/utils/helpers.cuh" +#include "shader_config.hpp" #include #include @@ -41,32 +41,34 @@ extern "C" __global__ void __intersection__gpuspatial() { auto point_on_seg = optixGetPayload_5(); auto point_part_id = optixGetPayload_6(); const auto& polygons = params.polygons; - auto point_idx = params.ids[query_idx].first; - auto polygon_idx = params.ids[query_idx].second; + auto point_idx = params.query_point_ids[query_idx]; + auto polygon_idx = params.query_polygon_ids[query_idx]; auto hit_polygon_idx = params.aabb_poly_ids[aabb_id]; auto hit_ring_idx = params.aabb_ring_ids[aabb_id]; + const auto& vertex_offsets = params.aabb_vertex_offsets[aabb_id]; // the seg being hit is not from the query polygon if (hit_polygon_idx != polygon_idx || hit_ring_idx != ring_idx) { return; } - uint32_t local_v1_idx = aabb_id - 
params.seg_begins[reordered_polygon_idx]; - uint32_t global_v1_idx = v_offset + local_v1_idx; - uint32_t global_v2_idx = global_v1_idx + 1; + auto ring = polygons[polygon_idx].get_ring(ring_idx); + RayCrossingCounter locator(crossing_count, point_on_seg); - auto vertices = polygons.get_vertices(); - // segment being hit - const auto& v1 = vertices[global_v1_idx]; - const auto& v2 = vertices[global_v2_idx]; + // For each segment in the AABB, count crossings + for (auto vertex_offset = vertex_offsets.first; vertex_offset < vertex_offsets.second; + ++vertex_offset) { + const auto& v1 = ring.get_point(vertex_offset); + const auto& v2 = ring.get_point(vertex_offset + 1); - RayCrossingCounter locator(crossing_count, point_on_seg); - if (!params.points.empty()) { - const auto& p = params.points[point_idx]; - locator.countSegment(p, v1, v2); - } else if (!params.multi_points.empty()) { - const auto& p = params.multi_points[point_idx].get_point(point_part_id); - locator.countSegment(p, v1, v2); + if (!params.points.empty()) { + const auto& p = params.points[point_idx]; + locator.countSegment(p, v1, v2); + } else if (!params.multi_points.empty()) { + const auto& p = params.multi_points[point_idx].get_point(point_part_id); + locator.countSegment(p, v1, v2); + } } + optixSetPayload_4(locator.get_crossing_count()); optixSetPayload_5(locator.get_point_on_segment()); } @@ -74,32 +76,30 @@ extern "C" __global__ void __intersection__gpuspatial() { extern "C" __global__ void __raygen__gpuspatial() { using namespace gpuspatial; using point_t = gpuspatial::ShaderPointType; - const auto& ids = params.ids; const auto& polygons = params.polygons; - for (uint32_t i = optixGetLaunchIndex().x; i < ids.size(); + for (uint32_t i = optixGetLaunchIndex().x; i < params.query_size; i += optixGetLaunchDimensions().x) { - auto point_idx = ids[i].first; - auto polygon_idx = ids[i].second; + auto point_idx = params.query_point_ids[i]; + auto polygon_idx = params.query_polygon_ids[i]; - auto it = 
thrust::lower_bound(thrust::seq, params.polygon_ids.begin(), - params.polygon_ids.end(), polygon_idx); - assert(it != params.polygon_ids.end()); - uint32_t reordered_polygon_idx = thrust::distance(params.polygon_ids.begin(), it); - assert(params.polygon_ids[reordered_polygon_idx] == polygon_idx); + auto it = thrust::lower_bound(thrust::seq, params.uniq_polygon_ids.begin(), + params.uniq_polygon_ids.end(), polygon_idx); + assert(it != params.uniq_polygon_ids.end()); + uint32_t reordered_polygon_idx = + thrust::distance(params.uniq_polygon_ids.begin(), it); + assert(params.uniq_polygon_ids[reordered_polygon_idx] == polygon_idx); auto handle_point = [&](const point_t& p, uint32_t point_part_id, int& IM) { - float3 origin; - // each polygon takes a z-plane - origin.x = p.x(); - origin.y = p.y(); + assert(!p.empty()); // cast ray toward positive x-axis float3 dir = {1, 0, 0}; const auto& polygon = polygons[polygon_idx]; const auto& mbr = polygon.get_mbr(); auto width = mbr.get_max().x() - mbr.get_min().x(); float tmin = 0; - float tmax = width; + // ensure the floating number is greater than the double + float tmax = next_float_from_double(width, 1, 2); // first polygon offset uint32_t ring_offset = polygons.get_prefix_sum_polygons()[polygon_idx]; @@ -119,7 +119,11 @@ extern "C" __global__ void __raygen__gpuspatial() { IM |= IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D; uint32_t ring = 0; locator.Init(); - origin.z = reordered_polygon_idx; + float3 origin; + // each polygon takes a z-plane + origin.x = p.x(); + origin.y = p.y(); + origin.z = polygon_idx; // test exterior optixTrace(params.handle, origin, dir, tmin, tmax, 0, OptixVisibilityMask(255), OPTIX_RAY_FLAG_NONE, // OPTIX_RAY_FLAG_NONE, diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.h b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.cuh similarity index 95% rename from c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.h rename to 
c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.cuh index 447590a26..1e920400b 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.h +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/ray_params.cuh @@ -17,9 +17,9 @@ #pragma once -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/geom/point.cuh" -#include "gpuspatial/utils/cuda_utils.h" +#include "gpuspatial/geom/box.hpp" +#include "gpuspatial/geom/point.hpp" +#include "gpuspatial/utils/cuda_utils.hpp" #include #include diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/shader_config.h b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/shader_config.hpp similarity index 100% rename from c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/shader_config.h rename to c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/shader_config.hpp diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu new file mode 100644 index 000000000..9f76af495 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu @@ -0,0 +1,682 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "gpuspatial/index/rt_spatial_index.cuh" +#include "gpuspatial/rt/launch_parameters.cuh" +#include "gpuspatial/utils/launcher.hpp" +#include "gpuspatial/utils/logger.hpp" +#include "gpuspatial/utils/morton_code.hpp" +#include "gpuspatial/utils/stopwatch.hpp" + +#include "rt/shaders/shader_id.hpp" + +#include "rmm/exec_policy.hpp" + +#include +#include +#include +#include + +#define OPTIX_MAX_RAYS (1lu << 30) + +namespace gpuspatial { +namespace detail { + +template +static rmm::device_uvector ComputeAABBs(rmm::cuda_stream_view stream, + const ArrayView>& mbrs) { + rmm::device_uvector aabbs(mbrs.size(), stream); + + thrust::transform(rmm::exec_policy_nosync(stream), mbrs.begin(), mbrs.end(), + aabbs.begin(), [] __device__(const Box& mbr) { + // handle empty boxes + if (mbr.get_min().empty() || mbr.get_max().empty()) { + // empty box + OptixAabb empty_aabb; + empty_aabb.minX = empty_aabb.minY = empty_aabb.minZ = 0.0f; + empty_aabb.maxX = empty_aabb.maxY = empty_aabb.maxZ = -1.0f; + return empty_aabb; + } + return mbr.ToOptixAabb(); + }); + return std::move(aabbs); +} + +template +rmm::device_uvector ComputeAABBs( + rmm::cuda_stream_view stream, rmm::device_uvector& points, + rmm::device_uvector& prefix_sum, + rmm::device_uvector& reordered_indices, int group_size, + rmm::device_uvector>& mbrs) { + using scalar_t = typename POINT_T::scalar_t; + using box_t = Box; + constexpr int n_dim = POINT_T::n_dim; + static_assert(n_dim == 2 || n_dim == 3, "Only 2D and 3D points are supported"); + POINT_T min_world_corner, max_world_corner; + + min_world_corner.set_max(); + max_world_corner.set_min(); + + for (int dim = 0; dim < n_dim; dim++) { + auto min_val = thrust::transform_reduce( + rmm::exec_policy_nosync(stream), points.begin(), points.end(), + [=] __device__(const POINT_T& p) -> scalar_t { return p.get_coordinate(dim); }, + std::numeric_limits::max(), thrust::minimum()); + auto max_val = thrust::transform_reduce( + rmm::exec_policy_nosync(stream), points.begin(), 
points.end(), + [=] __device__(const POINT_T& p) -> scalar_t { return p.get_coordinate(dim); }, + std::numeric_limits::lowest(), thrust::maximum()); + min_world_corner.set_coordinate(dim, min_val); + max_world_corner.set_coordinate(dim, max_val); + } + + auto np = points.size(); + rmm::device_uvector morton_codes(np, stream); + // compute morton codes and reorder indices + thrust::transform(rmm::exec_policy_nosync(stream), points.begin(), points.end(), + morton_codes.begin(), [=] __device__(const POINT_T& p) { + POINT_T norm_p; + + for (int dim = 0; dim < n_dim; dim++) { + auto min_val = min_world_corner.get_coordinate(dim); + auto max_val = max_world_corner.get_coordinate(dim); + auto extent = min_val == max_val ? 1 : max_val - min_val; + auto norm_val = (p.get_coordinate(dim) - min_val) / extent; + norm_p.set_coordinate(dim, norm_val); + } + return detail::morton_code(norm_p.get_vec()); + }); + reordered_indices.resize(np, stream); + thrust::sequence(rmm::exec_policy_nosync(stream), reordered_indices.begin(), + reordered_indices.end()); + thrust::sort_by_key(rmm::exec_policy_nosync(stream), morton_codes.begin(), + morton_codes.end(), reordered_indices.begin()); + auto n_aabbs = (np + group_size - 1) / group_size; + mbrs.resize(n_aabbs, stream); + rmm::device_uvector aabbs(n_aabbs, stream); + rmm::device_uvector np_per_aabb(n_aabbs, stream); + + auto* p_reordered_indices = reordered_indices.data(); + auto* p_aabbs = aabbs.data(); + auto* p_np_per_aabb = np_per_aabb.data(); + ArrayView v_points(points); + ArrayView v_mbrs(mbrs); + // each warp takes an AABB and processes points_per_aabb points + LaunchKernel(stream, [=] __device__() mutable { + using WarpReduce = cub::WarpReduce; + // One temp storage slot per active warp + __shared__ typename WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; + const int warp_id = threadIdx.x / 32; + const int lane_id = threadIdx.x % 32; + // Calculate global ID of the warp to stride through AABBs + const int 
global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + const int total_warps = (gridDim.x * blockDim.x) / 32; + + // Grid-Stride Loop: Each warp processes one AABB (one group of points) + for (uint32_t aabb_id = global_warp_id; aabb_id < n_aabbs; aabb_id += total_warps) { + INDEX_T idx_begin = aabb_id * group_size; + INDEX_T idx_end = thrust::min((INDEX_T)np, (INDEX_T)(idx_begin + group_size)); + int count = idx_end - idx_begin; + + // 1. Initialize Thread-Local Accumulators (Registers) + // Initialize to limits so empty/out-of-bounds threads don't affect reduction + scalar_t thread_min[n_dim]; + scalar_t thread_max[n_dim]; + +#pragma unroll + for (int d = 0; d < n_dim; d++) { + thread_min[d] = std::numeric_limits::max(); + thread_max[d] = std::numeric_limits::lowest(); + } + + // 2. Loop over the points in the group (Stride by 32) + // Every thread processes roughly group_size/32 points + for (int i = lane_id; i < count; i += 32) { + // Load index (Coalesced access to indices) + INDEX_T point_idx = p_reordered_indices[idx_begin + i]; + + // Load Point (Indirect access - unavoidable due to reordering) + const POINT_T& p = v_points[point_idx]; + +// Accumulate min/max locally in registers +#pragma unroll + for (int d = 0; d < n_dim; d++) { + scalar_t val = p.get_coordinate(d); + thread_min[d] = thrust::min(thread_min[d], val); + thread_max[d] = thrust::max(thread_max[d], val); + } + } + + // 3. Warp Reduction (Perform once per dimension per AABB) + POINT_T final_min, final_max; +#pragma unroll + for (int d = 0; d < n_dim; d++) { + // CUB WarpReduce handles the cross-lane communication + scalar_t agg_min = + WarpReduce(temp_storage[warp_id]).Reduce(thread_min[d], thrust::minimum<>()); + scalar_t agg_max = + WarpReduce(temp_storage[warp_id]).Reduce(thread_max[d], thrust::maximum<>()); + + // Only lane 0 holds the valid reduction result + if (lane_id == 0) { + final_min.set_coordinate(d, agg_min); + final_max.set_coordinate(d, agg_max); + } + } + + // 4. 
Store Results to Global Memory + if (lane_id == 0) { + p_np_per_aabb[aabb_id] = count; + + if (count > 0) { + box_t ext_mbr(final_min, final_max); + v_mbrs[aabb_id] = ext_mbr; + p_aabbs[aabb_id] = ext_mbr.ToOptixAabb(); + } else { + // Handle empty AABB case + OptixAabb empty_aabb; + empty_aabb.minX = empty_aabb.minY = empty_aabb.minZ = 0.0f; + empty_aabb.maxX = empty_aabb.maxY = empty_aabb.maxZ = -1.0f; + v_mbrs[aabb_id] = box_t(); + p_aabbs[aabb_id] = empty_aabb; + } + } + } + }); + prefix_sum.resize(n_aabbs + 1, stream); + prefix_sum.set_element_to_zero_async(0, stream); + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), np_per_aabb.begin(), + np_per_aabb.end(), prefix_sum.begin() + 1); +#ifndef NDEBUG + auto* p_prefix_sum = prefix_sum.data(); + + thrust::for_each(rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), + thrust::counting_iterator(aabbs.size()), + [=] __device__(size_t aabb_idx) { + auto begin = p_prefix_sum[aabb_idx]; + auto end = p_prefix_sum[aabb_idx + 1]; + const auto& aabb = p_aabbs[aabb_idx]; + + for (auto i = begin; i < end; i++) { + auto point_idx = p_reordered_indices[i]; + const auto& p = v_points[point_idx]; + for (int dim = 0; dim < n_dim; dim++) { + auto coord = p.get_coordinate(dim); + assert(coord >= (&aabb.minX)[dim] && coord <= (&aabb.maxX)[dim]); + assert(v_mbrs[aabb_idx].covers(p)); + } + } + }); +#endif + return std::move(aabbs); +} + +template +void RefineExactPoints(rmm::cuda_stream_view stream, ArrayView build_points, + ArrayView probe_points, ArrayView prefix_sum, + ArrayView reordered_indices, ArrayView rect_ids, + ArrayView point_ids, Queue& build_indices, + ArrayView probe_indices) { + auto d_queue = build_indices.DeviceObject(); + + LaunchKernel(stream, [=] __device__() mutable { + auto lane_id = threadIdx.x % 32; + auto global_warp_id = TID_1D / 32; + auto n_warps = TOTAL_THREADS_1D / 32; + + for (uint32_t i = global_warp_id; i < rect_ids.size(); i += n_warps) { + auto rect_id = rect_ids[i]; + auto 
point_id = point_ids[i]; + auto build_point_begin = prefix_sum[rect_id]; + auto build_point_end = prefix_sum[rect_id + 1]; + + for (uint32_t j = lane_id + build_point_begin; j < build_point_end; + j += WARP_SIZE) { + auto build_point_id = reordered_indices[j]; + const auto& build_point = build_points[build_point_id]; + const auto& probe_point = probe_points[point_id]; + if (build_point == probe_point) { + auto tail = d_queue.Append(build_point_id); + probe_indices[tail] = point_id; + } + } + } + }); +} +} // namespace detail + +template +RTSpatialIndex::RTSpatialIndex(const RTSpatialIndexConfig& config) + : config_(config), + stream_pool_(std::make_unique(config_.concurrency)), + indexing_points_(false), + handle_(0) {} + +template +void RTSpatialIndex::Clear() { + GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), Clear", this, + rmm::available_device_memory().first / 1024 / 1024); + auto stream = rmm::cuda_stream_default; + bvh_buffer_.resize(0, stream); + bvh_buffer_.shrink_to_fit(stream); + rects_.resize(0, stream); + rects_.shrink_to_fit(stream); + points_.resize(0, stream); + points_.shrink_to_fit(stream); + stream.synchronize(); +} + +template +void RTSpatialIndex::PushBuild(const box_t* rects, uint32_t n_rects) { + GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), PushBuild, rectangles %zu", this, + rmm::available_device_memory().first / 1024 / 1024, n_rects); + if (n_rects == 0) return; + auto stream = rmm::cuda_stream_default; + auto prev_size = rects_.size(); + + rects_.resize(rects_.size() + n_rects, stream); + CUDA_CHECK(cudaMemcpyAsync(rects_.data() + prev_size, rects, sizeof(box_t) * n_rects, + cudaMemcpyHostToDevice, stream)); +} + +template +void RTSpatialIndex::FinishBuilding() { + auto stream = rmm::cuda_stream_default; + + indexing_points_ = thrust::all_of(rmm::exec_policy_nosync(stream), rects_.begin(), + rects_.end(), [] __device__(const box_t& box) { + bool is_point = true; + for (int dim = 0; dim < n_dim; dim++) { + is_point &= 
box.get_min(dim) == box.get_max(dim); + } + return is_point; + }); + + rmm::device_uvector aabbs{0, stream}; + if (indexing_points_) { + points_.resize(rects_.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), rects_.begin(), rects_.end(), + points_.begin(), + [] __device__(const box_t& box) { return box.get_min(); }); + aabbs = std::move(detail::ComputeAABBs(stream, points_, point_ranges_, + reordered_point_indices_, + config_.n_points_per_aabb, rects_)); + } else { + aabbs = std::move(detail::ComputeAABBs(stream, ArrayView(rects_))); + } + + handle_ = config_.rt_engine->BuildAccelCustom(stream, ArrayView(aabbs), + bvh_buffer_, config_.prefer_fast_build, + config_.compact); + + GPUSPATIAL_LOG_INFO( + "RTSpatialIndex %p (Free %zu MB), FinishBuilding Index on %s, Total geoms: %zu", + this, rmm::available_device_memory().first / 1024 / 1024, + indexing_points_ ? "Points" : "Rectangles", numGeometries()); +} + +template +void RTSpatialIndex::Probe(const box_t* rects, uint32_t n_rects, + std::vector* build_indices, + std::vector* probe_indices) { + // Formulating point and box queries into ray tracing queries: + // Reference: "Geng L, Lee R, Zhang X. LibRTS: A Spatial Indexing Library by Ray + // Tracing. 
InProceedings of the 30th ACM SIGPLAN Annual Symposium on Principles and + // Practice of Parallel Programming 2025" + if (n_rects == 0) return; + SpatialIndexContext ctx; + auto stream = stream_pool_->get_stream(); + rmm::device_uvector d_rects(n_rects, stream); + rmm::device_uvector d_points{0, stream}; + + CUDA_CHECK(cudaMemcpyAsync(d_rects.data(), rects, sizeof(box_t) * n_rects, + cudaMemcpyHostToDevice, stream)); + + bool probe_points = thrust::all_of(rmm::exec_policy_nosync(stream), d_rects.begin(), + d_rects.end(), [] __device__(const box_t& box) { + bool is_point = true; + for (int dim = 0; dim < n_dim; dim++) { + is_point &= box.get_min(dim) == box.get_max(dim); + } + return is_point; + }); + + if (probe_points) { + d_points.resize(d_rects.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), d_rects.begin(), d_rects.end(), + d_points.begin(), + [] __device__(const box_t& box) { return box.get_min(); }); + d_rects.resize(0, stream); + d_rects.shrink_to_fit(stream); + + } else { + // Build a BVH over the MBRs of the stream geometries +#ifdef GPUSPATIAL_PROFILING + ctx.timer.start(stream); +#endif + rmm::device_uvector aabbs(n_rects, stream); + thrust::transform(rmm::exec_policy_nosync(stream), d_rects.begin(), d_rects.end(), + aabbs.begin(), + [] __device__(const box_t& mbr) { return mbr.ToOptixAabb(); }); + ctx.handle = config_.rt_engine->BuildAccelCustom( + stream, ArrayView(aabbs), ctx.bvh_buffer, config_.prefer_fast_build, + config_.compact); +#ifdef GPUSPATIAL_PROFILING + ctx.bvh_build_ms = ctx.timer.stop(stream); +#endif + } + + ctx.counter = std::make_unique>(0, stream); + + bool swap_ids = false; + + auto query = [&](bool counting) { +#ifdef GPUSPATIAL_PROFILING + ctx.timer.start(stream); +#endif + if (indexing_points_) { + if (probe_points) { + handleBuildPoint(ctx, ArrayView(d_points), counting); + } else { + handleBuildPoint(ctx, ArrayView(d_rects), counting); + swap_ids = true; + } + } else { + if (probe_points) { + 
handleBuildBox(ctx, ArrayView(d_points), counting); + } else { + handleBuildBox(ctx, ArrayView(d_rects), counting); + } + } +#ifdef GPUSPATIAL_PROFILING + ctx.rt_ms += ctx.timer.stop(stream); +#endif + }; + + // first pass: counting + query(true /* counting */); + + auto cap = ctx.counter->value(stream); + if (cap == 0) { + return; + } + allocateResultBuffer(ctx, cap); + // second pass: retrieve results + query(false /* counting */); + + auto result_size = ctx.build_indices.size(stream); + ArrayView v_build_indices(ctx.build_indices.data(), result_size); + ArrayView v_probe_indices(ctx.probe_indices.data(), result_size); + + if (swap_ids) { + // IMPORTANT: In this case, the BVH is built on probe side and points are + // cast on the build side, so the result pairs are (probe_id, build_id) instead of + // (build_id, probe_id). We need to swap the output buffers to correct this. + std::swap(v_build_indices, v_probe_indices); + } + +#ifdef GPUSPATIAL_PROFILING + Stopwatch sw; + sw.start(); +#endif + build_indices->resize(result_size); + CUDA_CHECK(cudaMemcpyAsync(build_indices->data(), v_build_indices.data(), + sizeof(index_t) * result_size, cudaMemcpyDeviceToHost, + stream)); + + probe_indices->resize(result_size); + CUDA_CHECK(cudaMemcpyAsync(probe_indices->data(), v_probe_indices.data(), + sizeof(index_t) * result_size, cudaMemcpyDeviceToHost, + stream)); + stream.synchronize(); +#ifdef GPUSPATIAL_PROFILING + sw.stop(); + ctx.copy_res_ms = sw.ms(); + GPUSPATIAL_LOG_INFO( + "RTSpatialIndex %p (Free %zu MB), Probe %s, Size: %zu, Results: %zu, Alloc: %.2f ms, BVH Build: %.2f ms, RT: %.2f ms, Copy res: %.2f ms", + this, rmm::available_device_memory().first / 1024 / 1024, + probe_points ? "Points" : "Rectangles", + probe_points ? 
d_points.size() : d_rects.size(), build_indices->size(), + ctx.alloc_ms, ctx.bvh_build_ms, ctx.rt_ms, ctx.copy_res_ms); +#endif +} + +template +void RTSpatialIndex::handleBuildPoint(SpatialIndexContext& ctx, + ArrayView points, + bool counting) const { + using launch_params_t = detail::LaunchParamsPointQuery; + + ctx.shader_id = GetPointQueryShaderId(); + ctx.launch_params_buffer.resize(sizeof(launch_params_t), ctx.stream); + ctx.h_launch_params_buffer.resize(sizeof(launch_params_t)); + auto& launch_params = + *reinterpret_cast(ctx.h_launch_params_buffer.data()); + + launch_params.rects = ArrayView(rects_); + launch_params.points = points; + launch_params.handle = handle_; + + uint32_t dim_x = std::min(OPTIX_MAX_RAYS, points.size()); + + if (counting) { + launch_params.count = ctx.counter->data(); + + CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), &launch_params, + sizeof(launch_params_t), cudaMemcpyHostToDevice, + ctx.stream)); + + filter(ctx, dim_x); + } else { + auto cap = ctx.build_indices.capacity(); + Queue rect_ids; + rmm::device_uvector point_ids(cap, ctx.stream); + + rect_ids.Init(ctx.stream, cap); + + launch_params.count = nullptr; + launch_params.rect_ids = rect_ids.DeviceObject(); + launch_params.point_ids = ArrayView(point_ids); + + CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), &launch_params, + sizeof(launch_params_t), cudaMemcpyHostToDevice, + ctx.stream)); + + filter(ctx, dim_x); + + detail::RefineExactPoints( + ctx.stream, ArrayView(points_), points, + ArrayView(point_ranges_), ArrayView(reordered_point_indices_), + ArrayView(rect_ids.data(), rect_ids.size(ctx.stream)), + ArrayView(point_ids), ctx.build_indices, + ArrayView(ctx.probe_indices)); + } +} + +template +void RTSpatialIndex::handleBuildPoint(SpatialIndexContext& ctx, + ArrayView rects, + bool counting) const { + using launch_params_t = detail::LaunchParamsPointQuery; + + ctx.shader_id = GetPointQueryShaderId(); + 
ctx.launch_params_buffer.resize(sizeof(launch_params_t), ctx.stream); + ctx.h_launch_params_buffer.resize(sizeof(launch_params_t)); + auto& launch_params = + *reinterpret_cast(ctx.h_launch_params_buffer.data()); + + launch_params.rects = rects; + launch_params.points = ArrayView(points_); + launch_params.handle = ctx.handle; + if (counting) { + launch_params.count = ctx.counter->data(); + } else { + launch_params.count = nullptr; + launch_params.rect_ids = ctx.build_indices.DeviceObject(); + launch_params.point_ids = ArrayView(ctx.probe_indices); + } + + CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), &launch_params, + sizeof(launch_params_t), cudaMemcpyHostToDevice, + ctx.stream)); + + uint32_t dim_x = std::min(OPTIX_MAX_RAYS, points_.size()); + + filter(ctx, dim_x); +} + +template +void RTSpatialIndex::handleBuildBox(SpatialIndexContext& ctx, + ArrayView points, + bool counting) const { + using launch_params_t = detail::LaunchParamsPointQuery; + + ctx.shader_id = GetPointQueryShaderId(); + ctx.launch_params_buffer.resize(sizeof(launch_params_t), ctx.stream); + ctx.h_launch_params_buffer.resize(sizeof(launch_params_t)); + auto& launch_params = + *reinterpret_cast(ctx.h_launch_params_buffer.data()); + + launch_params.rects = ArrayView(rects_); + launch_params.points = points; + launch_params.handle = handle_; + if (counting) { + launch_params.count = ctx.counter->data(); + } else { + launch_params.count = nullptr; + launch_params.rect_ids = ctx.build_indices.DeviceObject(); + launch_params.point_ids = + ArrayView(ctx.probe_indices.data(), ctx.probe_indices.size()); + } + + CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), &launch_params, + sizeof(launch_params_t), cudaMemcpyHostToDevice, + ctx.stream)); + + uint32_t dim_x = std::min(OPTIX_MAX_RAYS, points.size()); + + filter(ctx, dim_x); +} + +template +void RTSpatialIndex::handleBuildBox(SpatialIndexContext& ctx, + ArrayView rects, + bool counting) const { + // forward cast: cast rays from 
stream geometries with the BVH of build geometries + { + auto dim_x = std::min(OPTIX_MAX_RAYS, rects.size()); + + prepareLaunchParamsBoxQuery(ctx, rects, true /* forward */, counting); + filter(ctx, dim_x); + } + // backward cast: cast rays from the build geometries with the BVH of stream geometries + { + auto dim_x = std::min(OPTIX_MAX_RAYS, rects_.size()); + + prepareLaunchParamsBoxQuery(ctx, rects, false /* forward */, counting); + filter(ctx, dim_x); + } +} + +template +void RTSpatialIndex::allocateResultBuffer(SpatialIndexContext& ctx, + uint32_t capacity) const { +#ifdef GPUSPATIAL_PROFILING + ctx.timer.start(ctx.stream); +#endif + + GPUSPATIAL_LOG_INFO( + "RTSpatialIndex %p (Free %zu MB), Allocate result buffer, memory consumption %zu MB, capacity %u", + this, rmm::available_device_memory().first / 1024 / 1024, + (uint64_t)capacity * 2 * sizeof(index_t) / 1024 / 1024, capacity); + + ctx.build_indices.Init(ctx.stream, capacity); + ctx.probe_indices.resize(capacity, ctx.stream); +#ifdef GPUSPATIAL_PROFILING + ctx.alloc_ms += ctx.timer.stop(ctx.stream); +#endif +} + +template +void RTSpatialIndex::prepareLaunchParamsBoxQuery( + SpatialIndexContext& ctx, ArrayView probe_rects, bool forward, + bool counting) const { + using launch_params_t = detail::LaunchParamsBoxQuery; + ctx.launch_params_buffer.resize(sizeof(launch_params_t), ctx.stream); + ctx.h_launch_params_buffer.resize(sizeof(launch_params_t)); + auto& launch_params = + *reinterpret_cast(ctx.h_launch_params_buffer.data()); + + launch_params.rects1 = ArrayView(rects_); + launch_params.rects2 = probe_rects; + + if (forward) { + launch_params.handle = handle_; + ctx.shader_id = GetBoxQueryForwardShaderId(); + } else { + launch_params.handle = ctx.handle; + ctx.shader_id = GetBoxQueryBackwardShaderId(); + } + + if (counting) { + launch_params.count = ctx.counter->data(); + } else { + launch_params.count = nullptr; + launch_params.rect1_ids = ctx.build_indices.DeviceObject(); + launch_params.rect2_ids = 
ArrayView(ctx.probe_indices); + } + + CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), &launch_params, + sizeof(launch_params_t), cudaMemcpyHostToDevice, + ctx.stream)); +} + +template +void RTSpatialIndex::filter(SpatialIndexContext& ctx, + uint32_t dim_x) const { +#ifdef GPUSPATIAL_PROFILING + ctx.timer.start(ctx.stream); +#endif + if (dim_x > 0) { + config_.rt_engine->Render(ctx.stream, ctx.shader_id, dim3{dim_x, 1, 1}, + ArrayView((char*)ctx.launch_params_buffer.data(), + ctx.launch_params_buffer.size())); + } +#ifdef GPUSPATIAL_PROFILING + ctx.rt_ms += ctx.timer.stop(ctx.stream); +#endif +} + +template +std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config) { + auto index = std::make_unique>(config); + GPUSPATIAL_LOG_INFO( + "Create RTSpatialIndex %p, fast_build = %d, compact = %d, concurrency = %d", + index.get(), config.prefer_fast_build, config.compact, config.concurrency); + return std::move(index); +} + +template std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config); +template std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config); +template std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config); +template std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config); +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu new file mode 100644 index 000000000..af74e688a --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu @@ -0,0 +1,548 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "gpuspatial/loader/parallel_wkb_loader.hpp" +#include "gpuspatial/refine/rt_spatial_refiner.cuh" +#include "gpuspatial/relate/relate_engine.cuh" +#include "gpuspatial/utils/logger.hpp" + +#include "rt/shaders/shader_id.hpp" + +#include "rmm/cuda_stream_pool.hpp" +#include "rmm/exec_policy.hpp" + + +#include +#include +#include + +#include +#include +#include +#include + +#define OPTIX_MAX_RAYS (1lu << 30) + +namespace gpuspatial { + +namespace detail { +template +void ReorderIndices(rmm::cuda_stream_view stream, INDEX_IT index_begin, + INDEX_IT index_end, + rmm::device_uvector& sorted_uniq_indices, + rmm::device_uvector& reordered_indices) { + auto sorted_begin = sorted_uniq_indices.begin(); + auto sorted_end = sorted_uniq_indices.end(); + thrust::transform(rmm::exec_policy_nosync(stream), index_begin, index_end, + reordered_indices.begin(), [=] __device__(uint32_t val) { + auto it = + thrust::lower_bound(thrust::seq, sorted_begin, sorted_end, val); + return thrust::distance(sorted_begin, it); + }); +} + +template +struct PipelineSlot { + rmm::cuda_stream_view stream; + std::unique_ptr loader; + std::future prep_future; + + RTSpatialRefiner::IndicesMap indices_map; + + // These will be moved out after every batch + rmm::device_uvector d_batch_build_indices; + rmm::device_uvector d_batch_probe_indices; + + PipelineSlot(rmm::cuda_stream_view s, const std::shared_ptr& tp, + 
typename LoaderT::Config config) + : stream(s), d_batch_build_indices(0, s), d_batch_probe_indices(0, s) { + loader = std::make_unique(tp); + loader->Init(config); + } +}; +} // namespace detail + +RTSpatialRefiner::RTSpatialRefiner(const RTSpatialRefinerConfig& config) + : config_(config) { + thread_pool_ = std::make_shared(config_.parsing_threads); + stream_pool_ = std::make_unique(config_.concurrency); + CUDA_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, config_.stack_size_bytes)); + wkb_loader_ = std::make_unique(thread_pool_); + + ParallelWkbLoader::Config loader_config; + + loader_config.memory_quota = config_.wkb_parser_memory_quota; + + wkb_loader_->Init(loader_config); +} + +void RTSpatialRefiner::Clear() { + auto stream = rmm::cuda_stream_default; + wkb_loader_->Clear(stream); + build_geometries_.Clear(stream); +} + +void RTSpatialRefiner::PushBuild(const ArrowSchema* build_schema, + const ArrowArray* build_array) { + auto stream = rmm::cuda_stream_default; + + wkb_loader_->Parse(stream, build_schema, build_array, 0, build_array->length); +} + +void RTSpatialRefiner::FinishBuilding() { + auto stream = rmm::cuda_stream_default; + build_geometries_ = std::move(wkb_loader_->Finish(stream)); +} + +uint32_t RTSpatialRefiner::Refine(const ArrowSchema* probe_schema, + const ArrowArray* probe_array, Predicate predicate, + uint32_t* build_indices, uint32_t* probe_indices, + uint32_t len) { + if (len == 0) { + return 0; + } + + if (config_.pipeline_batches > 1) { + return RefinePipelined(probe_schema, probe_array, predicate, build_indices, + probe_indices, len); + } + + SpatialRefinerContext ctx; + ctx.cuda_stream = stream_pool_->get_stream(); + + IndicesMap probe_indices_map; + rmm::device_uvector d_probe_indices(len, ctx.cuda_stream); + + CUDA_CHECK(cudaMemcpyAsync(d_probe_indices.data(), probe_indices, + sizeof(uint32_t) * len, cudaMemcpyHostToDevice, + ctx.cuda_stream)); + + buildIndicesMap(ctx.cuda_stream, d_probe_indices.begin(), d_probe_indices.end(), + 
probe_indices_map); + + loader_t loader(thread_pool_); + loader_t::Config loader_config; + loader_config.memory_quota = config_.wkb_parser_memory_quota / config_.concurrency; + + loader.Init(loader_config); + loader.Parse(ctx.cuda_stream, probe_schema, probe_array, + probe_indices_map.h_uniq_indices.begin(), + probe_indices_map.h_uniq_indices.end()); + auto probe_geoms = std::move(loader.Finish(ctx.cuda_stream)); + + GPUSPATIAL_LOG_INFO( + "RTSpatialRefiner %p (Free %zu MB), Loaded Geometries, ProbeArray %ld, Loaded %u, Type %s", + this, rmm::available_device_memory().first / 1024 / 1024, probe_array->length, + probe_geoms.num_features(), + GeometryTypeToString(probe_geoms.get_geometry_type()).c_str()); + + RelateEngine relate_engine(&build_geometries_, + config_.rt_engine.get()); + RelateEngine::Config re_config; + + re_config.memory_quota = config_.relate_engine_memory_quota / config_.concurrency; + re_config.bvh_fast_build = config_.prefer_fast_build; + re_config.bvh_compact = config_.compact; + + relate_engine.set_config(re_config); + + rmm::device_uvector d_build_indices(len, ctx.cuda_stream); + CUDA_CHECK(cudaMemcpyAsync(d_build_indices.data(), build_indices, + sizeof(uint32_t) * len, cudaMemcpyHostToDevice, + ctx.cuda_stream)); + + GPUSPATIAL_LOG_INFO( + "RTSpatialRefiner %p (Free %zu MB), Evaluating %u Geometry Pairs with Predicate %s", + this, rmm::available_device_memory().first / 1024 / 1024, len, + PredicateToString(predicate)); + + ctx.timer.start(ctx.cuda_stream); + relate_engine.Evaluate(ctx.cuda_stream, probe_geoms, predicate, d_build_indices, + probe_indices_map.d_reordered_indices); + float refine_ms = ctx.timer.stop(ctx.cuda_stream); + auto new_size = d_build_indices.size(); + + GPUSPATIAL_LOG_INFO("RTSpatialRefiner %p (Free %zu MB), Refine time %f, new size %zu", + this, rmm::available_device_memory().first / 1024 / 1024, refine_ms, + new_size); + + d_probe_indices.resize(new_size, ctx.cuda_stream); + + 
thrust::gather(rmm::exec_policy_nosync(ctx.cuda_stream), + probe_indices_map.d_reordered_indices.begin(), + probe_indices_map.d_reordered_indices.end(), + probe_indices_map.d_uniq_indices.begin(), d_probe_indices.begin()); + + if (config_.sort_probe_indices) { + thrust::sort_by_key(rmm::exec_policy_nosync(ctx.cuda_stream), d_probe_indices.begin(), + d_probe_indices.end(), d_build_indices.begin()); + } + + CUDA_CHECK(cudaMemcpyAsync(build_indices, d_build_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToHost, + ctx.cuda_stream)); + + CUDA_CHECK(cudaMemcpyAsync(probe_indices, d_probe_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToHost, + ctx.cuda_stream)); + ctx.cuda_stream.synchronize(); + return new_size; +} + +uint32_t RTSpatialRefiner::RefinePipelined(const ArrowSchema* probe_schema, + const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, + uint32_t* probe_indices, uint32_t len) { + if (len == 0) return 0; + auto main_stream = stream_pool_->get_stream(); + + rmm::device_uvector d_build_indices(len, main_stream); + rmm::device_uvector d_probe_indices(len, main_stream); + + CUDA_CHECK(cudaMemcpyAsync(d_build_indices.data(), build_indices, + sizeof(uint32_t) * len, cudaMemcpyHostToDevice, + main_stream)); + CUDA_CHECK(cudaMemcpyAsync(d_probe_indices.data(), probe_indices, + sizeof(uint32_t) * len, cudaMemcpyHostToDevice, + main_stream)); + + thrust::sort_by_key(rmm::exec_policy_nosync(main_stream), d_probe_indices.begin(), + d_probe_indices.end(), d_build_indices.begin()); + + rmm::device_uvector d_final_build_indices(len, main_stream); + rmm::device_uvector d_final_probe_indices(len, main_stream); + + uint32_t tail_offset = 0; + + // Capture device ID for thread safety + int device_id; + CUDA_CHECK(cudaGetDevice(&device_id)); + + // Pipeline Config + const int NUM_SLOTS = 2; + int n_batches = config_.pipeline_batches; + size_t batch_size = (len + n_batches - 1) / n_batches; + + GPUSPATIAL_LOG_INFO( + 
"RTSpatialRefiner %p, pipeline refinement, total len %u, batches %d, batch size %zu", + this, len, n_batches, batch_size); + + // Resource allocation for slots + using loader_t = ParallelWkbLoader; + loader_t::Config loader_config; + loader_config.memory_quota = + config_.wkb_parser_memory_quota / config_.concurrency / NUM_SLOTS; + + rmm::cuda_stream_pool local_pool(NUM_SLOTS); + std::vector>> slots; + + for (int i = 0; i < NUM_SLOTS; ++i) { + slots.push_back(std::make_unique>( + local_pool.get_stream(), thread_pool_, loader_config)); + } + + // Engine Setup (Shared across slots) + RelateEngine relate_engine(&build_geometries_, + config_.rt_engine.get()); + RelateEngine::Config re_config; + re_config.memory_quota = + config_.relate_engine_memory_quota / config_.concurrency / NUM_SLOTS; + re_config.bvh_fast_build = config_.prefer_fast_build; + re_config.bvh_compact = config_.compact; + relate_engine.set_config(re_config); + + // --- BACKGROUND TASK (CPU Phase) --- + // This lambda handles: buildIndicesMap + WKB Parsing + auto prepare_batch_task = [&](detail::PipelineSlot* slot, + size_t offset, size_t count) { + // 1. Critical: Set context for this thread + CUDA_CHECK(cudaSetDevice(device_id)); + + // 2. Wait for GPU to finish previous work on this slot + slot->stream.synchronize(); + + // 3. Prepare Indices (CPU + H2D) + const uint32_t* batch_probe_ptr = d_probe_indices.data() + offset; + buildIndicesMap(slot->stream, batch_probe_ptr, batch_probe_ptr + count, + slot->indices_map); + + // 4. 
Parse WKB (CPU Heavy) + slot->loader->Clear(slot->stream); + slot->loader->Parse(slot->stream, probe_schema, probe_array, + slot->indices_map.h_uniq_indices.begin(), + slot->indices_map.h_uniq_indices.end()); + + // Return future geometries (H2D copy happens on Finish) + return slot->loader->Finish(slot->stream); + }; + + // --- PIPELINE PRIMING --- + // Start processing Batch 0 immediately in background + size_t first_batch_len = std::min(batch_size, (size_t)len); + slots[0]->prep_future = std::async(std::launch::async, prepare_batch_task, + slots[0].get(), 0, first_batch_len); + + main_stream.synchronize(); // Ensure allocation is done before main loop + + // --- MAIN PIPELINE LOOP --- + for (size_t offset = 0; offset < len; offset += batch_size) { + int curr_idx = (offset / batch_size) % NUM_SLOTS; + int next_idx = (curr_idx + 1) % NUM_SLOTS; + auto& curr_slot = slots[curr_idx]; + auto& next_slot = slots[next_idx]; + size_t current_batch_len = std::min(batch_size, len - offset); + + // 1. WAIT & RETRIEVE: Get Geometries from Background Task + // This will block only if CPU work for this batch is slower than GPU work for + // previous batch + dev_geometries_t probe_geoms; + if (curr_slot->prep_future.valid()) { + probe_geoms = std::move(curr_slot->prep_future.get()); + } + + // 2. KICKOFF NEXT: Start CPU work for Batch (N+1) + size_t next_offset = offset + batch_size; + if (next_offset < len) { + size_t next_len = std::min(batch_size, len - next_offset); + next_slot->prep_future = std::async(std::launch::async, prepare_batch_task, + next_slot.get(), next_offset, next_len); + } + + // 3. 
GPU EXECUTION PHASE + const uint32_t* batch_build_ptr = d_build_indices.data() + offset; + + // Copy build indices for this batch + curr_slot->d_batch_build_indices.resize(current_batch_len, curr_slot->stream); + CUDA_CHECK(cudaMemcpyAsync(curr_slot->d_batch_build_indices.data(), batch_build_ptr, + sizeof(uint32_t) * current_batch_len, + cudaMemcpyHostToDevice, curr_slot->stream)); + + // Relate/Refine + // Note: Evaluate filters d_batch_build_indices in-place + relate_engine.Evaluate(curr_slot->stream, probe_geoms, predicate, + curr_slot->d_batch_build_indices, + curr_slot->indices_map.d_reordered_indices); + + // 4. GATHER & APPEND RESULTS + // We need the size to know how much to gather + size_t new_size = curr_slot->d_batch_build_indices.size(); + + if (new_size > 0) { + // Gather original probe indices + curr_slot->d_batch_probe_indices.resize(new_size, curr_slot->stream); + thrust::gather(rmm::exec_policy_nosync(curr_slot->stream), + curr_slot->indices_map.d_reordered_indices.begin(), + curr_slot->indices_map.d_reordered_indices.end(), + curr_slot->indices_map.d_uniq_indices.begin(), + curr_slot->d_batch_probe_indices.begin()); + + // Append to Final Buffers (Device-to-Device Copy) + CUDA_CHECK(cudaMemcpyAsync(d_final_build_indices.data() + tail_offset, + curr_slot->d_batch_build_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToDevice, + curr_slot->stream)); + + CUDA_CHECK(cudaMemcpyAsync(d_final_probe_indices.data() + tail_offset, + curr_slot->d_batch_probe_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToDevice, + curr_slot->stream)); + + tail_offset += new_size; + } + } + + // --- FINALIZATION --- + + // Wait for all streams to finish writing to final buffers + for (auto& slot : slots) { + slot->stream.synchronize(); + } + + // Shrink probe vector to actual size for sorting + d_final_probe_indices.resize(tail_offset, main_stream); + d_final_build_indices.resize(tail_offset, main_stream); + + if (config_.sort_probe_indices) { 
+ thrust::sort_by_key(rmm::exec_policy_nosync(main_stream), + d_final_probe_indices.begin(), + d_final_probe_indices.end(), // Sort only valid range + d_final_build_indices.begin()); + } + + // Final Copy to Host + CUDA_CHECK(cudaMemcpyAsync(build_indices, d_final_build_indices.data(), + sizeof(uint32_t) * tail_offset, cudaMemcpyDeviceToHost, + main_stream)); + + CUDA_CHECK(cudaMemcpyAsync(probe_indices, d_final_probe_indices.data(), + sizeof(uint32_t) * tail_offset, cudaMemcpyDeviceToHost, + main_stream)); + + main_stream.synchronize(); + return tail_offset; +} + +uint32_t RTSpatialRefiner::Refine(const ArrowSchema* build_schema, + const ArrowArray* build_array, + const ArrowSchema* probe_schema, + const ArrowArray* probe_array, Predicate predicate, + uint32_t* build_indices, uint32_t* probe_indices, + uint32_t len) { + if (len == 0) { + return 0; + } + + auto cuda_stream = stream_pool_->get_stream(); + SpatialRefinerContext ctx; + + ctx.cuda_stream = cuda_stream; + + IndicesMap build_indices_map, probe_indices_map; + rmm::device_uvector d_indices(len, cuda_stream); + + CUDA_CHECK(cudaMemcpyAsync(d_indices.data(), build_indices, sizeof(uint32_t) * len, + cudaMemcpyHostToDevice, cuda_stream)); + buildIndicesMap(cuda_stream, d_indices.begin(), d_indices.end(), build_indices_map); + + CUDA_CHECK(cudaMemcpyAsync(d_indices.data(), probe_indices, sizeof(uint32_t) * len, + cudaMemcpyHostToDevice, cuda_stream)); + buildIndicesMap(cuda_stream, d_indices.begin(), d_indices.end(), probe_indices_map); + d_indices.resize(0, cuda_stream); + d_indices.shrink_to_fit(cuda_stream); + + loader_t loader(thread_pool_); + loader_t::Config loader_config; + loader_config.memory_quota = config_.wkb_parser_memory_quota / config_.concurrency; + loader.Init(loader_config); + loader.Parse(ctx.cuda_stream, build_schema, build_array, + build_indices_map.h_uniq_indices.begin(), + build_indices_map.h_uniq_indices.end()); + auto geoms1 = std::move(loader.Finish(ctx.cuda_stream)); + + 
loader.Clear(ctx.cuda_stream); + loader.Parse(ctx.cuda_stream, probe_schema, probe_array, + probe_indices_map.h_uniq_indices.begin(), + probe_indices_map.h_uniq_indices.end()); + auto geoms2 = std::move(loader.Finish(ctx.cuda_stream)); + + GPUSPATIAL_LOG_INFO( + "RTSpatialRefiner %p (Free %zu MB), Loaded Geometries, build_array %ld, Loaded %u, Type %s, probe_array %ld, Loaded %u, Type %s", + this, rmm::available_device_memory().first / 1024 / 1024, build_array->length, + geoms1.num_features(), GeometryTypeToString(geoms1.get_geometry_type()).c_str(), + probe_array->length, geoms2.num_features(), + GeometryTypeToString(geoms2.get_geometry_type()).c_str()); + + RelateEngine relate_engine(&geoms1, config_.rt_engine.get()); + RelateEngine::Config re_config; + + re_config.memory_quota = config_.relate_engine_memory_quota / config_.concurrency; + re_config.bvh_fast_build = config_.prefer_fast_build; + re_config.bvh_compact = config_.compact; + + relate_engine.set_config(re_config); + + GPUSPATIAL_LOG_INFO( + "RTSpatialRefiner %p (Free %zu MB), Evaluating %u Geometry Pairs with Predicate %s", + this, rmm::available_device_memory().first / 1024 / 1024, len, + PredicateToString(predicate)); + + ctx.timer.start(ctx.cuda_stream); + + relate_engine.Evaluate(ctx.cuda_stream, geoms2, predicate, + build_indices_map.d_reordered_indices, + probe_indices_map.d_reordered_indices); + float refine_ms = ctx.timer.stop(ctx.cuda_stream); + + auto new_size = build_indices_map.d_reordered_indices.size(); + GPUSPATIAL_LOG_INFO("RTSpatialRefiner %p (Free %zu MB), Refine time %f, new size %zu", + this, rmm::available_device_memory().first / 1024 / 1024, refine_ms, + new_size); + rmm::device_uvector d_build_indices(new_size, ctx.cuda_stream); + rmm::device_uvector d_probe_indices(new_size, ctx.cuda_stream); + + thrust::gather(rmm::exec_policy_nosync(ctx.cuda_stream), + build_indices_map.d_reordered_indices.begin(), + build_indices_map.d_reordered_indices.end(), + 
build_indices_map.d_uniq_indices.begin(), d_build_indices.begin()); + + thrust::gather(rmm::exec_policy_nosync(ctx.cuda_stream), + probe_indices_map.d_reordered_indices.begin(), + probe_indices_map.d_reordered_indices.end(), + probe_indices_map.d_uniq_indices.begin(), d_probe_indices.begin()); + + if (config_.sort_probe_indices) { + thrust::sort_by_key(rmm::exec_policy_nosync(ctx.cuda_stream), d_probe_indices.begin(), + d_probe_indices.end(), d_build_indices.begin()); + } + + CUDA_CHECK(cudaMemcpyAsync(build_indices, d_build_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToHost, + ctx.cuda_stream)); + + CUDA_CHECK(cudaMemcpyAsync(probe_indices, d_probe_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToHost, + ctx.cuda_stream)); + ctx.cuda_stream.synchronize(); + return new_size; +} + +template +void RTSpatialRefiner::buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT index_begin, + INDEX_IT index_end, + IndicesMap& indices_map) const { + auto len = thrust::distance(index_begin, index_end); + auto& d_uniq_indices = indices_map.d_uniq_indices; + auto& h_uniq_indices = indices_map.h_uniq_indices; + + d_uniq_indices.resize(len, stream); + CUDA_CHECK(cudaMemcpyAsync(d_uniq_indices.data(), index_begin, sizeof(uint32_t) * len, + cudaMemcpyDeviceToDevice, stream)); + + thrust::sort(rmm::exec_policy_nosync(stream), d_uniq_indices.begin(), + d_uniq_indices.end()); + auto uniq_end = thrust::unique(rmm::exec_policy_nosync(stream), d_uniq_indices.begin(), + d_uniq_indices.end()); + auto uniq_size = thrust::distance(d_uniq_indices.begin(), uniq_end); + + d_uniq_indices.resize(uniq_size, stream); + h_uniq_indices.resize(uniq_size); + + CUDA_CHECK(cudaMemcpyAsync(h_uniq_indices.data(), d_uniq_indices.data(), + sizeof(uint32_t) * uniq_size, cudaMemcpyDeviceToHost, + stream)); + + auto& d_reordered_indices = indices_map.d_reordered_indices; + + d_reordered_indices.resize(len, stream); + detail::ReorderIndices(stream, index_begin, index_end, 
d_uniq_indices, + d_reordered_indices); +} + +std::unique_ptr CreateRTSpatialRefiner( + const RTSpatialRefinerConfig& config) { + auto refiner = std::make_unique(config); + GPUSPATIAL_LOG_INFO( + "Create RTSpatialRefiner %p, fast_build = %d, compact = %d, " + "parsing_threads = %u, concurrency = %u, pipeline_batches = %u, " + "wkb_parser_memory_quota = %.2f, relate_engine_memory_quota = %.2f", + refiner.get(), config.prefer_fast_build, config.compact, config.parsing_threads, + config.concurrency, config.pipeline_batches, config.wkb_parser_memory_quota, + config.relate_engine_memory_quota); + return std::move(refiner); +} + +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/spatial_joiner.cu b/c/sedona-libgpuspatial/libgpuspatial/src/spatial_joiner.cu deleted file mode 100644 index 03aafaa27..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/src/spatial_joiner.cu +++ /dev/null @@ -1,483 +0,0 @@ - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-#include "gpuspatial/index/detail/launch_parameters.h" -#include "gpuspatial/index/relate_engine.cuh" -#include "gpuspatial/index/spatial_joiner.cuh" -#include "gpuspatial/loader/parallel_wkb_loader.h" -#include "gpuspatial/utils/logger.hpp" -#include "gpuspatial/utils/stopwatch.h" - -#include "rt/shaders/shader_id.hpp" - -#include "rmm/exec_policy.hpp" - -#define OPTIX_MAX_RAYS (1lu << 30) -namespace gpuspatial { - -namespace detail { - -template -static rmm::device_uvector ComputeAABBs( - rmm::cuda_stream_view stream, const ArrayView>>& mbrs) { - rmm::device_uvector aabbs(mbrs.size(), stream); - - thrust::transform(rmm::exec_policy_nosync(stream), mbrs.begin(), mbrs.end(), - aabbs.begin(), [] __device__(const Box>& mbr) { - OptixAabb aabb{0, 0, 0, 0, 0, 0}; - auto min_corner = mbr.get_min(); - auto max_corner = mbr.get_max(); - for (int dim = 0; dim < N_DIM; dim++) { - (&aabb.minX)[dim] = min_corner[dim]; - (&aabb.maxX)[dim] = max_corner[dim]; - } - return aabb; - }); - return std::move(aabbs); -} - -} // namespace detail - -void SpatialJoiner::Init(const Config* config) { - config_ = *dynamic_cast(config); - GPUSPATIAL_LOG_INFO("SpatialJoiner %p (Free %zu MB), Initialize, Concurrency %u", this, - rmm::available_device_memory().first / 1024 / 1024, - config_.concurrency); - details::RTConfig rt_config = details::get_default_rt_config(config_.ptx_root); - rt_engine_.Init(rt_config); - - loader_t::Config loader_config; - - thread_pool_ = std::make_shared(config_.parsing_threads); - build_loader_ = std::make_unique(thread_pool_); - build_loader_->Init(loader_config); - stream_pool_ = std::make_unique(config_.concurrency); - ctx_pool_ = ObjectPool::create(config_.concurrency); - CUDA_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, config_.stack_size_bytes)); - Clear(); -} - -void SpatialJoiner::Clear() { - GPUSPATIAL_LOG_INFO("SpatialJoiner %p (Free %zu MB), Clear", this, - rmm::available_device_memory().first / 1024 / 1024); - bvh_buffer_ = nullptr; - 
geometry_grouper_.Clear(); - auto stream = rmm::cuda_stream_default; - build_loader_->Clear(stream); - build_geometries_.Clear(stream); - stream.synchronize(); -} - -void SpatialJoiner::PushBuild(const ArrowSchema* schema, const ArrowArray* array, - int64_t offset, int64_t length) { - GPUSPATIAL_LOG_INFO("SpatialJoiner %p (Free %zu MB), PushBuild, offset %ld, length %ld", - this, rmm::available_device_memory().first / 1024 / 1024, offset, - length); - build_loader_->Parse(rmm::cuda_stream_default, array, offset, length); -} - -void SpatialJoiner::FinishBuilding() { - auto stream = rmm::cuda_stream_default; - - build_geometries_ = std::move(build_loader_->Finish(stream)); - - GPUSPATIAL_LOG_INFO( - "SpatialJoiner %p (Free %zu MB), FinishBuilding, n_features: %ld, type %s", this, - rmm::available_device_memory().first / 1024 / 1024, - build_geometries_.num_features(), - GeometryTypeToString(build_geometries_.get_geometry_type())); - - if (build_geometries_.get_geometry_type() == GeometryType::kPoint) { - geometry_grouper_.Group(stream, build_geometries_, config_.n_points_per_aabb); - handle_ = buildBVH(stream, geometry_grouper_.get_aabbs(), bvh_buffer_); - } else { - auto aabbs = detail::ComputeAABBs(stream, build_geometries_.get_mbrs()); - handle_ = buildBVH(stream, ArrayView(aabbs), bvh_buffer_); - } - - relate_engine_ = RelateEngine(&build_geometries_, &rt_engine_); - RelateEngine::Config re_config; - - re_config.memory_quota = config_.relate_engine_memory_quota; - re_config.bvh_fast_build = config_.prefer_fast_build; - re_config.bvh_fast_compact = config_.compact; - - relate_engine_.set_config(re_config); -} - -void SpatialJoiner::PushStream(Context* base_ctx, const ArrowSchema* schema, - const ArrowArray* array, int64_t offset, int64_t length, - Predicate predicate, std::vector* build_indices, - std::vector* stream_indices, - int32_t array_index_offset) { - auto* ctx = (SpatialJoinerContext*)base_ctx; - ctx->cuda_stream = stream_pool_->get_stream(); - -#ifdef 
GPUSPATIAL_PROFILING - Stopwatch sw; - sw.start(); -#endif - ctx->array_index_offset = array_index_offset; - - if (ctx->stream_loader == nullptr) { - ctx->stream_loader = std::make_unique(thread_pool_); - loader_t::Config loader_config; - - ctx->stream_loader->Init(loader_config); - } - ctx->stream_loader->Parse(ctx->cuda_stream, array, offset, length); - ctx->stream_geometries = std::move(ctx->stream_loader->Finish(ctx->cuda_stream)); - - auto build_type = build_geometries_.get_geometry_type(); - auto stream_type = ctx->stream_geometries.get_geometry_type(); - - GPUSPATIAL_LOG_INFO( - "SpatialJoiner %p, PushStream, build features %zu, type %s, stream features %zu, type %s", - this, build_geometries_.num_features(), - GeometryTypeToString(build_geometries_.get_geometry_type()), - ctx->stream_geometries.num_features(), - GeometryTypeToString(ctx->stream_geometries.get_geometry_type())); - -#ifdef GPUSPATIAL_PROFILING - sw.stop(); - ctx->parse_ms += sw.ms(); -#endif - - if (build_type == GeometryType::kPoint) { - if (stream_type == GeometryType::kPoint) { - handleBuildPointStreamPoint(ctx, predicate, build_indices, stream_indices); - } else { - handleBuildPointStreamBox(ctx, predicate, build_indices, stream_indices); - } - } else { - if (stream_type == GeometryType::kPoint) { - handleBuildBoxStreamPoint(ctx, predicate, build_indices, stream_indices); - } else { - handleBuildBoxStreamBox(ctx, predicate, build_indices, stream_indices); - } - } -#ifdef GPUSPATIAL_PROFILING - printf("parse %lf, alloc %lf, filter %lf, refine %lf, copy_res %lf ms\n", ctx->parse_ms, - ctx->alloc_ms, ctx->filter_ms, ctx->refine_ms, ctx->copy_res_ms); -#endif -} - -void SpatialJoiner::handleBuildPointStreamPoint(SpatialJoinerContext* ctx, - Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices) { - allocateResultBuffer(ctx); - - ctx->shader_id = GetPointQueryShaderId(); - assert(ctx->stream_geometries.get_geometry_type() == GeometryType::kPoint); - - using 
launch_params_t = detail::LaunchParamsPointQuery; - ctx->launch_params_buffer = - std::make_unique(sizeof(launch_params_t), ctx->cuda_stream); - ctx->h_launch_params_buffer.resize(sizeof(launch_params_t)); - auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data(); - - launch_params.grouped = true; - launch_params.prefix_sum = geometry_grouper_.get_prefix_sum(); - launch_params.reordered_indices = geometry_grouper_.get_reordered_indices(); - launch_params.mbrs1 = ArrayView(); // no MBRs for point - launch_params.points2 = ctx->stream_geometries.get_points(); - launch_params.handle = handle_; - launch_params.ids = ctx->results.DeviceObject(); - CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params, - sizeof(launch_params_t), cudaMemcpyHostToDevice, - ctx->cuda_stream)); - - uint32_t dim_x = std::min(OPTIX_MAX_RAYS, ctx->stream_geometries.num_features()); - - filter(ctx, dim_x); - refine(ctx, predicate, build_indices, stream_indices); -} - -void SpatialJoiner::handleBuildBoxStreamPoint(SpatialJoinerContext* ctx, - Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices) { - allocateResultBuffer(ctx); - - ctx->shader_id = GetPointQueryShaderId(); - assert(ctx->stream_geometries.get_geometry_type() == GeometryType::kPoint); - - using launch_params_t = detail::LaunchParamsPointQuery; - ctx->launch_params_buffer = - std::make_unique(sizeof(launch_params_t), ctx->cuda_stream); - ctx->h_launch_params_buffer.resize(sizeof(launch_params_t)); - auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data(); - - launch_params.grouped = false; - launch_params.mbrs1 = build_geometries_.get_mbrs(); - launch_params.points2 = ctx->stream_geometries.get_points(); - launch_params.handle = handle_; - launch_params.ids = ctx->results.DeviceObject(); - CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params, - sizeof(launch_params_t), cudaMemcpyHostToDevice, - ctx->cuda_stream)); - - 
uint32_t dim_x = std::min(OPTIX_MAX_RAYS, ctx->stream_geometries.num_features()); - - filter(ctx, dim_x); - refine(ctx, predicate, build_indices, stream_indices); -} - -void SpatialJoiner::handleBuildPointStreamBox(SpatialJoinerContext* ctx, - Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices) { - allocateResultBuffer(ctx); - - ctx->shader_id = GetPointQueryShaderId(); - assert(build_geometries_.get_geometry_type() == GeometryType::kPoint); - - using launch_params_t = detail::LaunchParamsPointQuery; - ctx->launch_params_buffer = - std::make_unique(sizeof(launch_params_t), ctx->cuda_stream); - ctx->h_launch_params_buffer.resize(sizeof(launch_params_t)); - auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data(); - - auto aabbs = detail::ComputeAABBs(ctx->cuda_stream, ctx->stream_geometries.get_mbrs()); - auto handle = buildBVH(ctx->cuda_stream, ArrayView(aabbs), ctx->bvh_buffer); - - // mbrs1 are from stream; points2 are from build - launch_params.grouped = false; - launch_params.mbrs1 = ctx->stream_geometries.get_mbrs(); - launch_params.points2 = build_geometries_.get_points(); - launch_params.handle = handle; - launch_params.ids = ctx->results.DeviceObject(); - CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params, - sizeof(launch_params_t), cudaMemcpyHostToDevice, - ctx->cuda_stream)); - - uint32_t dim_x = std::min(OPTIX_MAX_RAYS, build_geometries_.num_features()); - // IMPORTANT: In this case, the BVH is built from stream geometries and points2 are - // build geometries, so the result pairs are (stream_id, build_id) instead of (build_id, - // stream_id). We need to swap the output buffers to correct this. 
- filter(ctx, dim_x, true); - refine(ctx, predicate, build_indices, stream_indices); -} - -void SpatialJoiner::handleBuildBoxStreamBox(SpatialJoinerContext* ctx, - Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices) { - allocateResultBuffer(ctx); - - // forward cast: cast rays from stream geometries with the BVH of build geometries - { - auto dim_x = std::min(OPTIX_MAX_RAYS, ctx->stream_geometries.num_features()); - - prepareLaunchParamsBoxQuery(ctx, true); - filter(ctx, dim_x); - refine(ctx, predicate, build_indices, stream_indices); - ctx->results.Clear(ctx->cuda_stream); // results have been copied, reuse space - } - // need allocate again as the previous results buffer has been shrinked to fit - allocateResultBuffer(ctx); - // backward cast: cast rays from the build geometries with the BVH of stream geometries - { - auto dim_x = std::min(OPTIX_MAX_RAYS, build_geometries_.num_features()); - auto v_mbrs = ctx->stream_geometries.get_mbrs(); - rmm::device_uvector aabbs(v_mbrs.size(), ctx->cuda_stream); - - thrust::transform(rmm::exec_policy_nosync(ctx->cuda_stream), v_mbrs.begin(), - v_mbrs.end(), aabbs.begin(), - [] __device__(const box_t& mbr) { return mbr.ToOptixAabb(); }); - - // Build a BVH over the MBRs of the stream geometries - ctx->handle = - buildBVH(ctx->cuda_stream, ArrayView(aabbs.data(), aabbs.size()), - ctx->bvh_buffer); - prepareLaunchParamsBoxQuery(ctx, false); - filter(ctx, dim_x); - refine(ctx, predicate, build_indices, stream_indices); - } -} - -OptixTraversableHandle SpatialJoiner::buildBVH( - const rmm::cuda_stream_view& stream, const ArrayView& aabbs, - std::unique_ptr& buffer) { - auto buffer_size_bytes = rt_engine_.EstimateMemoryUsageForAABB( - aabbs.size(), config_.prefer_fast_build, config_.compact); - - if (buffer == nullptr || buffer->size() < buffer_size_bytes) { - buffer = std::make_unique(buffer_size_bytes, stream); - } - - return rt_engine_.BuildAccelCustom(stream, aabbs, *buffer, 
config_.prefer_fast_build, - config_.compact); -} - -void SpatialJoiner::allocateResultBuffer(SpatialJoinerContext* ctx) { -#ifdef GPUSPATIAL_PROFILING - ctx->timer.start(ctx->cuda_stream); -#endif - int64_t avail_bytes = rmm::available_device_memory().first; - auto stream_type = ctx->stream_geometries.get_geometry_type(); - if (stream_type != GeometryType::kPoint) { - // need to reserve space for the BVH of stream - auto n_aabbs = ctx->stream_geometries.get_mbrs().size(); - - avail_bytes -= rt_engine_.EstimateMemoryUsageForAABB( - n_aabbs, config_.prefer_fast_build, config_.compact); - } - - if (avail_bytes <= 0) { - throw std::runtime_error( - "Not enough memory to allocate result space for spatial index"); - } - - uint64_t reserve_bytes = ceil(avail_bytes * config_.result_buffer_memory_reserve_ratio); - reserve_bytes = reserve_bytes / config_.concurrency + 1; - // two index_t for each result pair (build index, stream index) and another index_t for - // the temp storage - uint32_t n_items = reserve_bytes / (2 * sizeof(index_t) + sizeof(index_t)); - - GPUSPATIAL_LOG_INFO( - "SpatialJoiner %p, Allocate result buffer quota %zu MB, queue size %u", this, - reserve_bytes / 1024 / 1024, n_items); - - ctx->results.Init(ctx->cuda_stream, n_items); - ctx->results.Clear(ctx->cuda_stream); -#ifdef GPUSPATIAL_PROFILING - ctx->alloc_ms += ctx->timer.stop(ctx->cuda_stream); -#endif -} - -void SpatialJoiner::prepareLaunchParamsBoxQuery(SpatialJoinerContext* ctx, bool foward) { - using launch_params_t = detail::LaunchParamsBoxQuery; - ctx->launch_params_buffer = - std::make_unique(sizeof(launch_params_t), ctx->cuda_stream); - ctx->h_launch_params_buffer.resize(sizeof(launch_params_t)); - auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data(); - - assert(ctx->stream_geometries.get_geometry_type() != GeometryType::kPoint); - - launch_params.mbrs1 = build_geometries_.get_mbrs(); - launch_params.mbrs2 = ctx->stream_geometries.get_mbrs(); - if (foward) { - 
launch_params.handle = handle_; - ctx->shader_id = GetBoxQueryForwardShaderId(); - } else { - launch_params.handle = ctx->handle; - ctx->shader_id = GetBoxQueryBackwardShaderId(); - } - - launch_params.ids = ctx->results.DeviceObject(); - CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params, - sizeof(launch_params_t), cudaMemcpyHostToDevice, - ctx->cuda_stream)); -} - -void SpatialJoiner::filter(SpatialJoinerContext* ctx, uint32_t dim_x, bool swap_id) { -#ifdef GPUSPATIAL_PROFILING - ctx->timer.start(ctx->cuda_stream); -#endif - Stopwatch sw; - sw.start(); - if (dim_x > 0) { - rt_engine_.Render(ctx->cuda_stream, ctx->shader_id, dim3{dim_x, 1, 1}, - ArrayView((char*)ctx->launch_params_buffer->data(), - ctx->launch_params_buffer->size())); - } - auto result_size = ctx->results.size(ctx->cuda_stream); - sw.stop(); - GPUSPATIAL_LOG_INFO( - "SpatialJoiner %p, Filter stage, Launched %u rays, Found %u candidates, time %lf ms", - this, dim_x, result_size, sw.ms()); - if (swap_id && result_size > 0) { - // swap the pair (build_id, stream_id) to (stream_id, build_id) - thrust::for_each(rmm::exec_policy_nosync(ctx->cuda_stream), ctx->results.data(), - ctx->results.data() + result_size, - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); - } - ctx->results.shrink_to_fit(ctx->cuda_stream); - -#ifdef GPUSPATIAL_PROFILING - ctx->filter_ms += ctx->timer.stop(ctx->cuda_stream); -#endif -} - -void SpatialJoiner::refine(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices) { -#ifdef GPUSPATIAL_PROFILING - ctx->timer.start(ctx->cuda_stream); -#endif - relate_engine_.Evaluate(ctx->cuda_stream, ctx->stream_geometries, predicate, - ctx->results); -#ifdef GPUSPATIAL_PROFILING - ctx->refine_ms += ctx->timer.stop(ctx->cuda_stream); -#endif - auto n_results = ctx->results.size(ctx->cuda_stream); - -#ifdef GPUSPATIAL_PROFILING - ctx->timer.start(ctx->cuda_stream); -#endif - 
rmm::device_uvector tmp_result_buffer(n_results, ctx->cuda_stream); - - thrust::transform( - rmm::exec_policy_nosync(ctx->cuda_stream), ctx->results.data(), - ctx->results.data() + n_results, tmp_result_buffer.begin(), - [] __device__(const thrust::pair& pair) -> uint32_t { - return pair.first; - }); - auto prev_size = build_indices->size(); - build_indices->resize(build_indices->size() + n_results); - - CUDA_CHECK(cudaMemcpyAsync(build_indices->data() + prev_size, tmp_result_buffer.data(), - sizeof(uint32_t) * n_results, cudaMemcpyDeviceToHost, - ctx->cuda_stream)); - - auto array_index_offset = ctx->array_index_offset; - - thrust::transform( - rmm::exec_policy_nosync(ctx->cuda_stream), ctx->results.data(), - ctx->results.data() + n_results, tmp_result_buffer.begin(), - [=] __device__(const thrust::pair& pair) -> uint32_t { - return pair.second + array_index_offset; - }); - - stream_indices->resize(stream_indices->size() + n_results); - - CUDA_CHECK(cudaMemcpyAsync(stream_indices->data() + prev_size, tmp_result_buffer.data(), - sizeof(uint32_t) * n_results, cudaMemcpyDeviceToHost, - ctx->cuda_stream)); -#ifdef GPUSPATIAL_PROFILING - ctx->copy_res_ms += ctx->timer.stop(ctx->cuda_stream); -#endif - ctx->cuda_stream.synchronize(); -} - -std::unique_ptr CreateSpatialJoiner() { - return std::make_unique(); -} - -void InitSpatialJoiner(StreamingJoiner* index, const char* ptx_root, - uint32_t concurrency) { - SpatialJoiner::SpatialJoinerConfig config; - config.ptx_root = ptx_root; - config.concurrency = concurrency; - index->Init(&config); -} - -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt b/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt index 719d0909f..bcf69239f 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt +++ b/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt @@ -53,8 +53,19 @@ if(GPUSPATIAL_BUILD_TESTS) PRIVATE $<$:--expt-extended-lambda --expt-relaxed-constexpr>) - 
add_executable(joiner_test main.cc array_stream.cc joiner_test.cu) - target_link_libraries(joiner_test + add_executable(index_test main.cc index_test.cu) + target_link_libraries(index_test + PRIVATE cuda + GTest::gtest_main + GTest::gmock_main + gpuspatial + GEOS::geos + GEOS::geos_c) + target_compile_options(index_test + PRIVATE $<$:--expt-extended-lambda + --expt-relaxed-constexpr>) + add_executable(refiner_test main.cc array_stream.cc refiner_test.cu) + target_link_libraries(refiner_test PRIVATE cuda GTest::gtest_main GTest::gmock_main @@ -65,7 +76,7 @@ if(GPUSPATIAL_BUILD_TESTS) Arrow::arrow_static Parquet::parquet_static nanoarrow::nanoarrow_ipc) - target_compile_options(joiner_test + target_compile_options(refiner_test PRIVATE $<$:--expt-extended-lambda --expt-relaxed-constexpr>) @@ -83,14 +94,19 @@ if(GPUSPATIAL_BUILD_TESTS) --expt-relaxed-constexpr>) add_executable(c_wrapper_test main.cc c_wrapper_test.cc array_stream.cc) - target_link_libraries(c_wrapper_test PRIVATE GTest::gtest_main GTest::gmock_main - gpuspatial_c nanoarrow::nanoarrow_ipc) + target_link_libraries(c_wrapper_test + PRIVATE GTest::gtest_main + GTest::gmock_main + gpuspatial_c + GEOS::geos + GEOS::geos_c + geoarrow_geos + nanoarrow::nanoarrow_ipc) include(GoogleTest) gtest_discover_tests(gpuspatial_testing_test) gtest_discover_tests(array_stream_test) gtest_discover_tests(loader_test) - gtest_discover_tests(joiner_test) gtest_discover_tests(relate_test) endif() diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc b/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc index 60c247399..c56d2f1da 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc +++ b/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc @@ -24,40 +24,136 @@ #include #include #include "array_stream.hpp" +#include "geoarrow_geos/geoarrow_geos.hpp" #include "nanoarrow/nanoarrow.hpp" -namespace TestUtils { -std::string GetTestDataPath(const std::string& 
relative_path_to_file); +TEST(RuntimeTest, InitializeRuntime) { + GpuSpatialRuntime runtime; + GpuSpatialRuntimeCreate(&runtime); + GpuSpatialRuntimeConfig config; + + std::string ptx_root = TestUtils::GetTestShaderPath(); + config.ptx_root = ptx_root.c_str(); + config.device_id = 0; + config.use_cuda_memory_pool = false; + ASSERT_EQ(runtime.init(&runtime, &config), 0); + + runtime.release(&runtime); +} + +TEST(RuntimeTest, ErrorTest) { + GpuSpatialRuntime runtime; + GpuSpatialRuntimeCreate(&runtime); + GpuSpatialRuntimeConfig runtime_config; + + runtime_config.ptx_root = "/invalid/path/to/ptx"; + runtime_config.device_id = 0; + runtime_config.use_cuda_memory_pool = false; + + EXPECT_NE(runtime.init(&runtime, &runtime_config), 0); + + const char* raw_error = runtime.get_last_error(&runtime); + printf("Error received: %s\n", raw_error); + + std::string error_msg(raw_error); + + EXPECT_NE(error_msg.find("No such file or directory"), std::string::npos) + << "Error message was corrupted or incorrect. 
Got: " << error_msg; + + runtime.release(&runtime); +} + +TEST(SpatialIndexTest, InitializeIndex) { + GpuSpatialRuntime runtime; + GpuSpatialRuntimeCreate(&runtime); + GpuSpatialRuntimeConfig runtime_config; + + std::string ptx_root = TestUtils::GetTestShaderPath(); + runtime_config.ptx_root = ptx_root.c_str(); + runtime_config.device_id = 0; + runtime_config.use_cuda_memory_pool = true; + runtime_config.cuda_memory_pool_init_precent = 10; + ASSERT_EQ(runtime.init(&runtime, &runtime_config), 0); + + SedonaFloatIndex2D index; + GpuSpatialIndexConfig index_config; + + index_config.runtime = &runtime; + index_config.concurrency = 1; + + ASSERT_EQ(GpuSpatialIndexFloat2DCreate(&index, &index_config), 0); + + index.release(&index); + runtime.release(&runtime); +} + +TEST(RefinerTest, InitializeRefiner) { + GpuSpatialRuntime runtime; + GpuSpatialRuntimeCreate(&runtime); + GpuSpatialRuntimeConfig runtime_config; + + std::string ptx_root = TestUtils::GetTestShaderPath(); + runtime_config.ptx_root = ptx_root.c_str(); + runtime_config.device_id = 0; + runtime_config.use_cuda_memory_pool = true; + runtime_config.cuda_memory_pool_init_precent = 10; + ASSERT_EQ(runtime.init(&runtime, &runtime_config), 0); + + SedonaSpatialRefiner refiner; + GpuSpatialRefinerConfig refiner_config; + + refiner_config.runtime = &runtime; + refiner_config.concurrency = 1; + + ASSERT_EQ(GpuSpatialRefinerCreate(&refiner, &refiner_config), 0); + + refiner.release(&refiner); + runtime.release(&runtime); } class CWrapperTest : public ::testing::Test { protected: void SetUp() override { - // Initialize the GpuSpatialJoiner - GpuSpatialJoinerCreate(&joiner_); - struct GpuSpatialJoinerConfig config_; - std::string ptx_root = TestUtils::GetTestDataPath("shaders_ptx"); + std::string ptx_root = TestUtils::GetTestShaderPath(); + + GpuSpatialRuntimeCreate(&runtime_); + GpuSpatialRuntimeConfig runtime_config; + + runtime_config.ptx_root = ptx_root.c_str(); + runtime_config.device_id = 0; + 
runtime_config.use_cuda_memory_pool = true; + runtime_config.cuda_memory_pool_init_precent = 10; + ASSERT_EQ(runtime_.init(&runtime_, &runtime_config), 0); + + GpuSpatialIndexConfig index_config; + + index_config.runtime = &runtime_; + index_config.concurrency = 1; + + ASSERT_EQ(GpuSpatialIndexFloat2DCreate(&index_, &index_config), 0); - // Set up the configuration - config_.concurrency = 2; // Example concurrency level - config_.ptx_root = ptx_root.c_str(); + GpuSpatialRefinerConfig refiner_config; - ASSERT_EQ(joiner_.init(&joiner_, &config_), 0); - // Initialize the context + refiner_config.runtime = &runtime_; + refiner_config.concurrency = 1; + + ASSERT_EQ(GpuSpatialRefinerCreate(&refiner_, &refiner_config), 0); } void TearDown() override { - // Clean up - joiner_.release(&joiner_); + refiner_.release(&refiner_); + index_.release(&index_); + runtime_.release(&runtime_); } - - struct GpuSpatialJoiner joiner_; + GpuSpatialRuntime runtime_; + SedonaFloatIndex2D index_; + SedonaSpatialRefiner refiner_; }; TEST_F(CWrapperTest, InitializeJoiner) { + using fpoint_t = gpuspatial::Point; + using box_t = gpuspatial::Box; // Test if the joiner initializes correctly - struct GpuSpatialJoinerContext context_; - joiner_.create_context(&joiner_, &context_); auto poly_path = TestUtils::GetTestDataPath("arrowipc/test_polygons.arrows"); auto point_path = TestUtils::GetTestDataPath("arrowipc/test_points.arrows"); @@ -73,6 +169,8 @@ TEST_F(CWrapperTest, InitializeJoiner) { int n_row_groups = 100; + geoarrow::geos::ArrayReader reader; + for (int i = 0; i < n_row_groups; i++) { ASSERT_EQ(ArrowArrayStreamGetNext(poly_stream.get(), build_array.get(), &error), NANOARROW_OK); @@ -84,23 +182,138 @@ TEST_F(CWrapperTest, InitializeJoiner) { ASSERT_EQ(ArrowArrayStreamGetSchema(point_stream.get(), stream_schema.get(), &error), NANOARROW_OK); - joiner_.push_build(&joiner_, build_schema.get(), build_array.get(), 0, - build_array->length); - joiner_.finish_building(&joiner_); + class 
GEOSCppHandle { + public: + GEOSContextHandle_t handle; + + GEOSCppHandle() { handle = GEOS_init_r(); } + + ~GEOSCppHandle() { GEOS_finish_r(handle); } + }; + GEOSCppHandle handle; + + reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB); + + geoarrow::geos::GeometryVector geom_build(handle.handle); + + geom_build.resize(build_array->length); + size_t n_build; + + ASSERT_EQ(reader.Read(build_array.get(), 0, build_array->length, + geom_build.mutable_data(), &n_build), + GEOARROW_GEOS_OK); + auto* tree = GEOSSTRtree_create_r(handle.handle, 10); + std::vector rects; + + for (size_t build_idx = 0; build_idx < build_array->length; build_idx++) { + auto* geom = geom_build.borrow(build_idx); + auto* box = GEOSEnvelope_r(handle.handle, geom); + + double xmin, ymin, xmax, ymax; + int result = GEOSGeom_getExtent_r(handle.handle, box, &xmin, &ymin, &xmax, &ymax); + ASSERT_EQ(result, 1); + box_t bbox(fpoint_t((float)xmin, (float)ymin), fpoint_t((float)xmax, (float)ymax)); + + rects.push_back(bbox); + + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)build_idx); + GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom); + GEOSGeom_destroy_r(handle.handle, box); + } + + index_.clear(&index_); + ASSERT_EQ(index_.push_build(&index_, (float*)rects.data(), rects.size()), 0); + ASSERT_EQ(index_.finish_building(&index_), 0); - joiner_.push_stream(&joiner_, &context_, stream_schema.get(), stream_array.get(), 0, - stream_array->length, GpuSpatialPredicateContains, 0); + geoarrow::geos::GeometryVector geom_stream(handle.handle); + size_t n_stream; + geom_stream.resize(stream_array->length); - void* build_indices_ptr; - void* stream_indices_ptr; + ASSERT_EQ(reader.Read(stream_array.get(), 0, stream_array->length, + geom_stream.mutable_data(), &n_stream), + GEOARROW_GEOS_OK); + + std::vector queries; + + for (size_t stream_idx = 0; stream_idx < stream_array->length; stream_idx++) { + auto* geom = geom_stream.borrow(stream_idx); + double xmin, ymin, xmax, 
ymax; + int result = GEOSGeom_getExtent_r(handle.handle, geom, &xmin, &ymin, &xmax, &ymax); + ASSERT_EQ(result, 1); + box_t bbox(fpoint_t((float)xmin, (float)ymin), fpoint_t((float)xmax, (float)ymax)); + queries.push_back(bbox); + } + + SedonaSpatialIndexContext idx_ctx; + index_.create_context(&idx_ctx); + + index_.probe(&index_, &idx_ctx, (float*)queries.data(), queries.size()); + + uint32_t* build_indices_ptr; + uint32_t* probe_indices_ptr; uint32_t build_indices_length; - uint32_t stream_indices_length; + uint32_t probe_indices_length; - joiner_.get_build_indices_buffer(&context_, (void**)&build_indices_ptr, - &build_indices_length); - joiner_.get_stream_indices_buffer(&context_, (void**)&stream_indices_ptr, - &stream_indices_length); - } + index_.get_build_indices_buffer(&idx_ctx, &build_indices_ptr, &build_indices_length); + index_.get_probe_indices_buffer(&idx_ctx, &probe_indices_ptr, &probe_indices_length); + + uint32_t new_len; + ASSERT_EQ( + refiner_.refine(&refiner_, build_schema.get(), build_array.get(), + stream_schema.get(), stream_array.get(), + SedonaSpatialRelationPredicate::SedonaSpatialPredicateContains, + (uint32_t*)build_indices_ptr, (uint32_t*)probe_indices_ptr, + build_indices_length, &new_len), + 0); - joiner_.destroy_context(&context_); + std::vector build_indices((uint32_t*)build_indices_ptr, + (uint32_t*)build_indices_ptr + new_len); + std::vector probe_indices((uint32_t*)probe_indices_ptr, + (uint32_t*)probe_indices_ptr + new_len); + + struct Payload { + GEOSContextHandle_t handle; + const GEOSGeometry* geom; + std::vector build_indices; + std::vector stream_indices; + SedonaSpatialRelationPredicate predicate; + }; + + Payload payload; + payload.predicate = SedonaSpatialRelationPredicate::SedonaSpatialPredicateContains; + payload.handle = handle.handle; + + for (size_t offset = 0; offset < n_stream; offset++) { + auto* geom = geom_stream.borrow(offset); + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); + 
payload.geom = geom; + + GEOSSTRtree_query_r( + handle.handle, tree, geom, + [](void* item, void* data) { + auto* geom_build = (GEOSGeometry*)item; + auto* payload = (Payload*)data; + auto* geom_stream = payload->geom; + + if (GEOSContains_r(payload->handle, geom_build, geom_stream) == 1) { + auto build_id = (size_t)GEOSGeom_getUserData_r(payload->handle, geom_build); + auto stream_id = + (size_t)GEOSGeom_getUserData_r(payload->handle, geom_stream); + payload->build_indices.push_back(build_id); + payload->stream_indices.push_back(stream_id); + } + }, + (void*)&payload); + } + + ASSERT_EQ(payload.build_indices.size(), build_indices.size()); + ASSERT_EQ(payload.stream_indices.size(), probe_indices.size()); + TestUtils::sort_vectors_by_index(payload.build_indices, payload.stream_indices); + TestUtils::sort_vectors_by_index(build_indices, probe_indices); + for (size_t j = 0; j < build_indices.size(); j++) { + ASSERT_EQ(payload.build_indices[j], build_indices[j]); + ASSERT_EQ(payload.stream_indices[j], probe_indices[j]); + } + index_.destroy_context(&idx_ctx); + } } diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile index 5b04c384b..ac2eb06d8 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile +++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile @@ -19,7 +19,7 @@ URL := https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/natural-e INPUT_FILE := natural-earth_cities_geo.parquet PYTHON_SCRIPT := ../gen_points.py OUTPUT_POINTS := generated_points.parquet -NUM_POINTS := 1000 +NUM_POINTS := 10000 .PHONY: all clean generate diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet index 4ad348b3a..024547360 100644 Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet and 
b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet differ diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile index 147a332bd..f154c4416 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile +++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile @@ -19,7 +19,7 @@ URL := https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/natural-e INPUT_FILE := natural-earth_countries_geo.parquet PYTHON_SCRIPT := ../gen_points.py OUTPUT_POINTS := generated_points.parquet -NUM_POINTS := 1000 +NUM_POINTS := 10000 .PHONY: all clean generate diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet index 32d8dcc27..70af40443 100644 Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet and b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet differ diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py b/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py index a02f4a094..b23a89ebc 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py +++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py @@ -47,7 +47,7 @@ def calculate_bbox_and_generate_points(geoparquet_path, n_points, output_path): # Generate random coordinates random_x = np.random.uniform(minx, maxx, n_points) - random_y = np.random.uniform(miny, miny, n_points) + random_y = np.random.uniform(miny, maxy, n_points) # 4. 
Create a GeoDataFrame from the points diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/index_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/index_test.cu new file mode 100644 index 000000000..42f5769e2 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/test/index_test.cu @@ -0,0 +1,300 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "array_stream.hpp" +#include "gpuspatial/index/rt_spatial_index.cuh" +#include "test_common.hpp" + +#include +#include +#include + +#include +#include +#include // For std::iota +#include +#include + +namespace gpuspatial { +template +struct SpatialIndexTest : public ::testing::Test { + using index_t = RTSpatialIndex; + std::shared_ptr rt_engine; + index_t index; + + SpatialIndexTest() { + auto ptx_root = TestUtils::GetTestShaderPath(); + + rt_engine = std::make_shared(); + rt_engine->Init(get_default_rt_config(ptx_root)); + RTSpatialIndexConfig config; + config.rt_engine = rt_engine; + index = std::move(index_t(config)); + } +}; +using PointTypes = ::testing::Types, Point>; +TYPED_TEST_SUITE(SpatialIndexTest, PointTypes); + +template +std::vector> GeneratePoints(size_t n, std::mt19937& rng) { + using scalar_t = typename POINT_T::scalar_t; + std::vector> rects(n); + + for (size_t i = 0; i < n; i++) { + POINT_T p; + for (int dim = 0; dim < POINT_T::n_dim; dim++) { + std::uniform_real_distribution dist(-180.0, 180.0); + p.set_coordinate(dim, dist(rng)); + } + rects[i] = Box(p, p); + } + return rects; +} + +template +std::vector> GenerateRects(size_t n, std::mt19937& rng) { + using scalar_t = typename POINT_T::scalar_t; + std::vector> rects(n); + std::uniform_real_distribution distSize(0.0, 100); + + for (size_t i = 0; i < n; ++i) { + POINT_T min_pt, max_pt, size_pt; + + for (int dim = 0; dim < POINT_T::n_dim; dim++) { + std::uniform_real_distribution dist(-180.0, 180.0); + min_pt.set_coordinate(dim, dist(rng)); + size_pt.set_coordinate(dim, distSize(rng)); + } + max_pt = min_pt + size_pt; + rects[i] = Box(min_pt, max_pt); + } + return rects; +} + +template +void ComputeReference(const std::vector>& build, + const std::vector>& probe, + std::vector& build_indices, + std::vector& probe_indices) { + geos::index::strtree::STRtree tree; + + // FIX: Create a storage container for envelopes that persists + // for the lifetime of the tree usage. 
+ std::vector build_envelopes; + build_envelopes.reserve(build.size()); + + // 2. Build Phase + for (uint32_t j = 0; j < build.size(); j++) { + auto min_corner = build[j].get_min(); + auto max_corner = build[j].get_max(); + + // Emplace the envelope into our persistent vector + build_envelopes.emplace_back(min_corner.x(), max_corner.x(), min_corner.y(), + max_corner.y()); + + // Pass the address of the element inside the vector + // Note: We reserved memory above, so pointers shouldn't be invalidated by resizing + tree.insert(&build_envelopes.back(), + reinterpret_cast(static_cast(j))); + } + + tree.build(); + + // 3. Define Visitor (No changes needed here) + class InteractionVisitor : public geos::index::ItemVisitor { + public: + const std::vector>* build; + const std::vector>* probe; + std::vector* b_indices; + std::vector* p_indices; + uint32_t current_probe_idx; + + void visitItem(void* item) override { + uintptr_t build_idx_ptr = reinterpret_cast(item); + uint32_t build_idx = static_cast(build_idx_ptr); + + // Refinement step + if ((*build)[build_idx].intersects((*probe)[current_probe_idx])) { + b_indices->push_back(build_idx); + p_indices->push_back(current_probe_idx); + } + } + }; + + InteractionVisitor visitor; + visitor.build = &build; + visitor.probe = &probe; + visitor.b_indices = &build_indices; + visitor.p_indices = &probe_indices; + + // 4. Probe Phase + for (uint32_t i = 0; i < probe.size(); i++) { + auto min_corner = probe[i].get_min(); + auto max_corner = probe[i].get_max(); + + // It is safe to create this on the stack here because `query` + // finishes executing before `search_env` goes out of scope. + geos::geom::Envelope search_env(min_corner.x(), max_corner.x(), min_corner.y(), + max_corner.y()); + + visitor.current_probe_idx = i; + tree.query(&search_env, visitor); + } +} + +template +void sort_vectors(std::vector& v1, std::vector& v2) { + if (v1.size() != v2.size()) return; + + // 1. 
Create indices [0, 1, 2, ..., N-1] + std::vector p(v1.size()); + std::iota(p.begin(), p.end(), 0); + + // 2. Sort indices based on comparing values in v1 and v2 + std::sort(p.begin(), p.end(), [&](size_t i, size_t j) { + if (v1[i] != v1[j]) return v1[i] < v1[j]; // Primary sort by v1 + return v2[i] < v2[j]; // Secondary sort by v2 + }); + + // 3. Apply permutation (Reorder v1 and v2 based on sorted indices) + // Note: Doing this in-place with O(1) space is complex; + // using auxiliary O(N) space is standard. + std::vector sorted_v1, sorted_v2; + sorted_v1.reserve(v1.size()); + sorted_v2.reserve(v2.size()); + + for (size_t i : p) { + sorted_v1.push_back(v1[i]); + sorted_v2.push_back(v2[i]); + } + + v1 = std::move(sorted_v1); + v2 = std::move(sorted_v2); +} + +TYPED_TEST(SpatialIndexTest, PointPoint) { + using point_t = TypeParam; + std::mt19937 gen(0); + + for (int i = 1; i <= 10000; i *= 2) { + auto points1 = GeneratePoints(i, gen); + this->index.Clear(); + this->index.PushBuild(points1.data(), points1.size()); + this->index.FinishBuilding(); + + for (int j = 1; j <= 10000; j *= 2) { + auto points2 = GeneratePoints(j, gen); + + size_t count = static_cast(points1.size() * 0.2); + + // 2. Define the starting point (the last 'count' elements) + auto start_it = points1.end() - count; + + // 3. 
Append to the second vector + points2.insert(points2.end(), start_it, points1.end()); + + std::vector build_indices, probe_indices; + this->index.Probe(points2.data(), points2.size(), &build_indices, &probe_indices); + sort_vectors(build_indices, probe_indices); + + std::vector ref_build_indices, ref_probe_indices; + ComputeReference(points1, points2, ref_build_indices, ref_probe_indices); + sort_vectors(ref_build_indices, ref_probe_indices); + + ASSERT_EQ(build_indices, ref_build_indices); + ASSERT_EQ(probe_indices, ref_probe_indices); + } + } +} + +TYPED_TEST(SpatialIndexTest, BoxPoint) { + using point_t = TypeParam; + std::mt19937 gen(0); + + for (int i = 1; i <= 10000; i *= 2) { + auto rects1 = GenerateRects(i, gen); + this->index.Clear(); + this->index.PushBuild(rects1.data(), rects1.size()); + this->index.FinishBuilding(); + + for (int j = 1; j <= 10000; j *= 2) { + auto points2 = GeneratePoints(j, gen); + std::vector build_indices, probe_indices; + this->index.Probe(points2.data(), points2.size(), &build_indices, &probe_indices); + sort_vectors(build_indices, probe_indices); + + std::vector ref_build_indices, ref_probe_indices; + ComputeReference(rects1, points2, ref_build_indices, ref_probe_indices); + sort_vectors(ref_build_indices, ref_probe_indices); + + ASSERT_EQ(build_indices, ref_build_indices); + ASSERT_EQ(probe_indices, ref_probe_indices); + } + } +} + +TYPED_TEST(SpatialIndexTest, PointBox) { + using point_t = TypeParam; + std::mt19937 gen(0); + + for (int i = 1; i <= 10000; i *= 2) { + auto points1 = GeneratePoints(i, gen); + this->index.Clear(); + this->index.PushBuild(points1.data(), points1.size()); + this->index.FinishBuilding(); + + for (int j = 1; j <= 10000; j *= 2) { + auto rects2 = GenerateRects(j, gen); + std::vector build_indices, probe_indices; + this->index.Probe(rects2.data(), rects2.size(), &build_indices, &probe_indices); + sort_vectors(build_indices, probe_indices); + + std::vector ref_build_indices, ref_probe_indices; + 
ComputeReference(points1, rects2, ref_build_indices, ref_probe_indices); + sort_vectors(ref_build_indices, ref_probe_indices); + + ASSERT_EQ(build_indices, ref_build_indices); + ASSERT_EQ(probe_indices, ref_probe_indices); + } + } +} + +TYPED_TEST(SpatialIndexTest, BoxBox) { + using point_t = TypeParam; + std::mt19937 gen(0); + + for (int i = 1; i <= 10000; i *= 2) { + auto rects1 = GenerateRects(i, gen); + this->index.Clear(); + this->index.PushBuild(rects1.data(), rects1.size()); + this->index.FinishBuilding(); + + for (int j = 1; j <= 10000; j *= 2) { + auto rects2 = GenerateRects(j, gen); + std::vector build_indices, probe_indices; + this->index.Probe(rects2.data(), rects2.size(), &build_indices, &probe_indices); + sort_vectors(build_indices, probe_indices); + + std::vector ref_build_indices, ref_probe_indices; + ComputeReference(rects1, rects2, ref_build_indices, ref_probe_indices); + sort_vectors(ref_build_indices, ref_probe_indices); + + ASSERT_EQ(build_indices, ref_build_indices); + ASSERT_EQ(probe_indices, ref_probe_indices); + } + } +} +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/joiner_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/joiner_test.cu deleted file mode 100644 index bbf415592..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/test/joiner_test.cu +++ /dev/null @@ -1,438 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#include "array_stream.hpp" -#include "gpuspatial/index/spatial_joiner.cuh" -#include "gpuspatial/loader/device_geometries.cuh" -#include "test_common.hpp" - -#include "geoarrow_geos/geoarrow_geos.hpp" -#include "nanoarrow/nanoarrow.hpp" - -#include -#include -#include -#include // For std::iota - -namespace gpuspatial { -// Function to read a single Parquet file and extract a column. -static arrow::Status ReadParquetFromFile( - arrow::fs::FileSystem* fs, // 1. Filesystem pointer (e.g., LocalFileSystem) - const std::string& file_path, // 2. Single file path instead of a folder - int64_t batch_size, const char* column_name, - std::vector>& out_arrays) { - // 1. Get FileInfo for the single path - ARROW_ASSIGN_OR_RAISE(auto file_info, fs->GetFileInfo(file_path)); - - // Check if the path points to a file - if (file_info.type() != arrow::fs::FileType::File) { - return arrow::Status::Invalid("Path is not a file: ", file_path); - } - - std::cout << "--- Processing Parquet file: " << file_path << " ---" << std::endl; - - // 2. Open the input file - ARROW_ASSIGN_OR_RAISE(auto input_file, fs->OpenInputFile(file_info)); - - // 3. Open the Parquet file and create an Arrow reader - ARROW_ASSIGN_OR_RAISE(auto arrow_reader, parquet::arrow::OpenFile( - input_file, arrow::default_memory_pool())); - - // 4. Set the batch size - arrow_reader->set_batch_size(batch_size); - - // 5. Get the RecordBatchReader - auto rb_reader = arrow_reader->GetRecordBatchReader().ValueOrDie(); - // 6. 
Read all record batches and extract the column - while (true) { - std::shared_ptr batch; - - // Read the next batch - ARROW_THROW_NOT_OK(rb_reader->ReadNext(&batch)); - - // Check for end of stream - if (!batch) { - break; - } - - // Extract the specified column and add to the output vector - std::shared_ptr column_array = batch->GetColumnByName(column_name); - if (!column_array) { - return arrow::Status::Invalid("Column not found: ", column_name); - } - out_arrays.push_back(column_array); - } - - std::cout << "Finished reading. Total arrays extracted: " << out_arrays.size() - << std::endl; - return arrow::Status::OK(); -} - -using GeosBinaryPredicateFn = char (*)(GEOSContextHandle_t, const GEOSGeometry*, - const GEOSGeometry*); -static GeosBinaryPredicateFn GetGeosPredicateFn(Predicate predicate) { - switch (predicate) { - case Predicate::kContains: - return &GEOSContains_r; - case Predicate::kIntersects: - return &GEOSIntersects_r; - case Predicate::kWithin: - return &GEOSWithin_r; - case Predicate::kEquals: - return &GEOSEquals_r; - case Predicate::kTouches: - return &GEOSTouches_r; - default: - throw std::out_of_range("Unsupported GEOS predicate enumeration value."); - } -} - -void TestJoiner(const std::string& build_parquet_path, - const std::string& stream_parquet_path, Predicate predicate, - int batch_size = 10) { - using namespace TestUtils; - auto fs = std::make_shared(); - SpatialJoiner::SpatialJoinerConfig config; - std::string ptx_root = TestUtils::GetTestShaderPath(); - - config.ptx_root = ptx_root.c_str(); - SpatialJoiner spatial_joiner; - - spatial_joiner.Init(&config); - spatial_joiner.Clear(); - - geoarrow::geos::ArrayReader reader; - - class GEOSCppHandle { - public: - GEOSContextHandle_t handle; - - GEOSCppHandle() { handle = GEOS_init_r(); } - - ~GEOSCppHandle() { GEOS_finish_r(handle); } - }; - GEOSCppHandle handle; - - reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB); - - geoarrow::geos::GeometryVector 
geom_build(handle.handle); - - auto get_total_length = [](const std::vector>& arrays) { - size_t total_length = 0; - for (const auto& array : arrays) { - total_length += array->length(); - } - return total_length; - }; - - std::vector> build_arrays; - ARROW_THROW_NOT_OK(ReadParquetFromFile(fs.get(), build_parquet_path, batch_size, - "geometry", build_arrays)); - - // Using GEOS for reference - geom_build.resize(get_total_length(build_arrays)); - size_t tail_build = 0; - auto* tree = GEOSSTRtree_create_r(handle.handle, 10); - - for (auto& array : build_arrays) { - nanoarrow::UniqueArray unique_array; - nanoarrow::UniqueSchema unique_schema; - - ARROW_THROW_NOT_OK( - arrow::ExportArray(*array, unique_array.get(), unique_schema.get())); - - spatial_joiner.PushBuild(unique_schema.get(), unique_array.get(), 0, - unique_array->length); - - // geos for reference - size_t n_build; - - ASSERT_EQ(reader.Read(unique_array.get(), 0, unique_array->length, - geom_build.mutable_data() + tail_build, &n_build), - GEOARROW_GEOS_OK); - - for (size_t offset = tail_build; offset < tail_build + n_build; offset++) { - auto* geom = geom_build.borrow(offset); - auto* box = GEOSEnvelope_r(handle.handle, geom); - GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); - GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom); - GEOSGeom_destroy_r(handle.handle, box); - } - tail_build += n_build; - } - spatial_joiner.FinishBuilding(); - ASSERT_EQ(GEOSSTRtree_build_r(handle.handle, tree), 1); - - std::vector> stream_arrays; - ARROW_THROW_NOT_OK(ReadParquetFromFile( - fs.get(), stream_parquet_path, batch_size, "geometry", stream_arrays)); - int array_index_offset = 0; - auto context = spatial_joiner.CreateContext(); - - for (auto& array : stream_arrays) { - nanoarrow::UniqueArray unique_array; - nanoarrow::UniqueSchema unique_schema; - - ARROW_THROW_NOT_OK( - arrow::ExportArray(*array, unique_array.get(), unique_schema.get())); - std::vector build_indices, stream_indices; 
- - spatial_joiner.PushStream(context.get(), unique_schema.get(), unique_array.get(), 0, - unique_array->length, predicate, &build_indices, - &stream_indices, array_index_offset); - - geoarrow::geos::GeometryVector geom_stream(handle.handle); - size_t n_stream; - geom_stream.resize(array->length()); - ASSERT_EQ(reader.Read(unique_array.get(), 0, unique_array->length, - geom_stream.mutable_data(), &n_stream), - GEOARROW_GEOS_OK); - struct Payload { - GEOSContextHandle_t handle; - const GEOSGeometry* geom; - int64_t stream_index_offset; - std::vector build_indices; - std::vector stream_indices; - Predicate predicate; - }; - - Payload payload; - payload.predicate = predicate; - payload.handle = handle.handle; - - payload.stream_index_offset = array_index_offset; - - for (size_t offset = 0; offset < n_stream; offset++) { - auto* geom = geom_stream.borrow(offset); - GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); - payload.geom = geom; - - GEOSSTRtree_query_r( - handle.handle, tree, geom, - [](void* item, void* data) { - auto* geom_build = (GEOSGeometry*)item; - auto* payload = (Payload*)data; - auto* geom_stream = payload->geom; - - if (GetGeosPredicateFn(payload->predicate)(payload->handle, geom_build, - geom_stream) == 1) { - auto build_id = (size_t)GEOSGeom_getUserData_r(payload->handle, geom_build); - auto stream_id = - (size_t)GEOSGeom_getUserData_r(payload->handle, geom_stream); - payload->build_indices.push_back(build_id); - payload->stream_indices.push_back(payload->stream_index_offset + stream_id); - } - }, - (void*)&payload); - } - - ASSERT_EQ(payload.build_indices.size(), build_indices.size()); - ASSERT_EQ(payload.stream_indices.size(), stream_indices.size()); - sort_vectors_by_index(payload.build_indices, payload.stream_indices); - sort_vectors_by_index(build_indices, stream_indices); - for (size_t j = 0; j < build_indices.size(); j++) { - ASSERT_EQ(payload.build_indices[j], build_indices[j]); - 
ASSERT_EQ(payload.stream_indices[j], stream_indices[j]); - } - array_index_offset += array->length(); - } - GEOSSTRtree_destroy_r(handle.handle, tree); -} - -TEST(JoinerTest, PIPContainsParquet) { - using namespace TestUtils; - auto fs = std::make_shared(); - - std::vector polys{ - GetTestDataPath("cities/natural-earth_cities_geo.parquet"), - GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; - std::vector points{GetTestDataPath("cities/generated_points.parquet"), - GetTestDataPath("countries/generated_points.parquet")}; - - for (int i = 0; i < polys.size(); i++) { - auto poly_path = TestUtils::GetTestDataPath(polys[i]); - auto point_path = TestUtils::GetCanonicalPath(points[i]); - TestJoiner(poly_path, point_path, Predicate::kContains, 10); - } -} - -TEST(JoinerTest, PIPWithinParquet) { - using namespace TestUtils; - auto fs = std::make_shared(); - - std::vector polys{ - GetTestDataPath("cities/natural-earth_cities_geo.parquet"), - GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; - std::vector points{GetTestDataPath("cities/generated_points.parquet"), - GetTestDataPath("countries/generated_points.parquet")}; - - for (int i = 0; i < polys.size(); i++) { - auto poly_path = TestUtils::GetTestDataPath(polys[i]); - auto point_path = TestUtils::GetCanonicalPath(points[i]); - TestJoiner(point_path, poly_path, Predicate::kWithin, 10); - } -} - -TEST(JoinerTest, PolyPointIntersectsParquet) { - using namespace TestUtils; - auto fs = std::make_shared(); - - std::vector polys{ - GetTestDataPath("cities/natural-earth_cities_geo.parquet"), - GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; - std::vector points{GetTestDataPath("cities/generated_points.parquet"), - GetTestDataPath("countries/generated_points.parquet")}; - - for (int i = 0; i < polys.size(); i++) { - auto poly_path = TestUtils::GetTestDataPath(polys[i]); - auto point_path = TestUtils::GetCanonicalPath(points[i]); - TestJoiner(point_path, poly_path, 
Predicate::kIntersects, 10); - } -} - -TEST(JoinerTest, PolygonPolygonContains) { - SpatialJoiner::SpatialJoinerConfig config; - std::string ptx_root = TestUtils::GetTestShaderPath(); - config.ptx_root = ptx_root.c_str(); - SpatialJoiner spatial_joiner; - - nanoarrow::UniqueArrayStream poly1_stream, poly2_stream; - - auto poly1_path = TestUtils::GetTestDataPath("arrowipc/test_polygons1.arrows"); - auto poly2_path = TestUtils::GetTestDataPath("arrowipc/test_polygons2.arrows"); - - ArrayStreamFromIpc(poly1_path, "geometry", poly1_stream.get()); - ArrayStreamFromIpc(poly2_path, "geometry", poly2_stream.get()); - - nanoarrow::UniqueSchema build_schema, stream_schema; - nanoarrow::UniqueArray build_array, stream_array; - ArrowError error; - ArrowErrorSet(&error, ""); - int n_row_groups = 100; - int array_index_offset = 0; - std::vector build_indices, stream_indices; - geoarrow::geos::ArrayReader reader; - - class GEOSCppHandle { - public: - GEOSContextHandle_t handle; - - GEOSCppHandle() { handle = GEOS_init_r(); } - - ~GEOSCppHandle() { GEOS_finish_r(handle); } - }; - GEOSCppHandle handle; - - reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB); - - geoarrow::geos::GeometryVector geom_polygons1(handle.handle); - geoarrow::geos::GeometryVector geom_polygons2(handle.handle); - struct Payload { - GEOSContextHandle_t handle; - const GEOSGeometry* geom; - int64_t build_index_offset; - int64_t stream_index_offset; - std::vector build_indices; - std::vector stream_indices; - }; - - int64_t build_count = 0; - spatial_joiner.Init(&config); - for (int i = 0; i < n_row_groups; i++) { - ASSERT_EQ(ArrowArrayStreamGetNext(poly1_stream.get(), build_array.get(), &error), - NANOARROW_OK); - ASSERT_EQ(ArrowArrayStreamGetSchema(poly1_stream.get(), build_schema.get(), &error), - NANOARROW_OK); - - ASSERT_EQ(ArrowArrayStreamGetNext(poly2_stream.get(), stream_array.get(), &error), - NANOARROW_OK); - ASSERT_EQ(ArrowArrayStreamGetSchema(poly2_stream.get(), stream_schema.get(), 
&error), - NANOARROW_OK); - - spatial_joiner.Clear(); - spatial_joiner.PushBuild(nullptr, build_array.get(), 0, build_array->length); - auto context = spatial_joiner.CreateContext(); - - build_indices.clear(); - stream_indices.clear(); - spatial_joiner.FinishBuilding(); - spatial_joiner.PushStream(context.get(), nullptr, stream_array.get(), 0, - stream_array->length, Predicate::kContains, &build_indices, - &stream_indices, array_index_offset); - geom_polygons1.resize(build_array->length); - geom_polygons2.resize(stream_array->length); - - size_t n_polygons1 = 0, n_polygons2 = 0; - ASSERT_EQ(reader.Read(build_array.get(), 0, build_array->length, - geom_polygons1.mutable_data(), &n_polygons1), - GEOARROW_GEOS_OK); - ASSERT_EQ(reader.Read(stream_array.get(), 0, stream_array->length, - geom_polygons2.mutable_data(), &n_polygons2), - GEOARROW_GEOS_OK); - - auto* tree = GEOSSTRtree_create_r(handle.handle, 10); - - for (size_t j = 0; j < n_polygons1; j++) { - auto* geom_polygon = geom_polygons1.borrow(j); - auto* box = GEOSEnvelope_r(handle.handle, geom_polygon); - GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom_polygon, (void*)j); - GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom_polygon); - GEOSGeom_destroy_r(handle.handle, box); - } - ASSERT_EQ(GEOSSTRtree_build_r(handle.handle, tree), 1); - - Payload payload; - payload.handle = handle.handle; - - payload.build_index_offset = build_count; - payload.stream_index_offset = array_index_offset; - - for (size_t j = 0; j < n_polygons2; j++) { - auto* geom_poly2 = geom_polygons2.borrow(j); - GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom_poly2, (void*)j); - - payload.geom = geom_poly2; - - GEOSSTRtree_query_r( - handle.handle, tree, geom_poly2, - [](void* item, void* data) { - auto* polygon1 = (GEOSGeometry*)item; - auto* payload = (Payload*)data; - auto* polygon2 = payload->geom; - - if (GEOSContains_r(payload->handle, polygon1, polygon2) == 1) { - auto polygon1_id = - 
(size_t)GEOSGeom_getUserData_r(payload->handle, polygon1); - auto polygon2_id = - (size_t)GEOSGeom_getUserData_r(payload->handle, polygon2); - payload->build_indices.push_back(payload->build_index_offset + polygon1_id); - payload->stream_indices.push_back(payload->stream_index_offset + - polygon2_id); - } - }, - (void*)&payload); - } - - GEOSSTRtree_destroy_r(handle.handle, tree); - - ASSERT_EQ(payload.build_indices.size(), build_indices.size()); - - build_count += build_array->length; - array_index_offset += stream_array->length; - } -} - -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu index f8a762974..bb60bad87 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu @@ -15,13 +15,13 @@ // specific language governing permissions and limitations // under the License. #include "array_stream.hpp" -#include "gpuspatial/geom/geometry_collection.cuh" -#include "gpuspatial/geom/multi_polygon.cuh" -#include "gpuspatial/loader/device_geometries.cuh" -#include "gpuspatial/utils/pinned_vector.h" +#include "gpuspatial/geom/geometry_collection.hpp" +#include "gpuspatial/geom/multi_polygon.hpp" +#include "gpuspatial/loader/device_geometries.hpp" +#include "gpuspatial/utils/pinned_vector.hpp" #include "nanoarrow/nanoarrow.hpp" -#include "gpuspatial/geom/multi_point.cuh" +#include "gpuspatial/geom/multi_point.hpp" #include "test_common.hpp" #include @@ -34,7 +34,7 @@ #include #include #include -#include "gpuspatial/loader/parallel_wkb_loader.h" +#include "gpuspatial/loader/parallel_wkb_loader.hpp" namespace gpuspatial { template @@ -45,6 +45,7 @@ TYPED_TEST(WKBLoaderTest, Point) { using point_t = typename TypeParam::first_type; using index_t = typename TypeParam::second_type; nanoarrow::UniqueArrayStream stream; + nanoarrow::UniqueSchema schema; ArrayStreamFromWKT({{"POINT (0 0)"}, {"POINT (10 
20)", "POINT (-5.5 -12.3)"}, {"POINT (100 -50)", "POINT (3.1415926535 2.7182818284)", @@ -62,11 +63,14 @@ TYPED_TEST(WKBLoaderTest, Point) { nanoarrow::UniqueArray array; ArrowError error; ArrowErrorSet(&error, ""); - EXPECT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; if (array->length == 0) { break; } - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); } auto geometries = loader.Finish(cuda_stream); @@ -103,13 +107,17 @@ TYPED_TEST(WKBLoaderTest, MultiPoint) { while (1) { nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - EXPECT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; if (array->length == 0) { break; } - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); } auto geometries = loader.Finish(cuda_stream); @@ -145,6 +153,7 @@ TYPED_TEST(WKBLoaderTest, PointMultiPoint) { using point_t = typename TypeParam::first_type; using index_t = typename TypeParam::second_type; nanoarrow::UniqueArrayStream stream; + nanoarrow::UniqueSchema schema; ArrayStreamFromWKT({{"POINT (1 2)", "MULTIPOINT ((3 4), (5 6))"}, {"POINT (7 8)", "MULTIPOINT ((9 10))"}, {"MULTIPOINT EMPTY", "POINT (11 12)"}}, @@ -158,11 +167,14 @@ TYPED_TEST(WKBLoaderTest, PointMultiPoint) { nanoarrow::UniqueArray array; ArrowError error; ArrowErrorSet(&error, ""); - 
EXPECT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; if (array->length == 0) { break; } - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); } auto geometries = loader.Finish(cuda_stream); @@ -207,6 +219,7 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderWithHoles) { GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); @@ -215,9 +228,12 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderWithHoles) { loader.Init(); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto geometries = loader.Finish(cuda_stream); auto points = TestUtils::ToVector(cuda_stream, geometries.get_points()); @@ -327,17 +343,21 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderMultipolygon) { GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); rmm::cuda_stream cuda_stream; - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; ParallelWkbLoader loader; loader.Init(); - 
loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto geometries = loader.Finish(cuda_stream); const auto& offsets = geometries.get_offsets(); @@ -431,6 +451,7 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderMultipolygonLocate) { GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); @@ -438,9 +459,12 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderMultipolygonLocate) { rmm::cuda_stream cuda_stream; loader.Init(); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto geometries = loader.Finish(cuda_stream); const auto& offsets = geometries.get_offsets(); @@ -498,18 +522,21 @@ TYPED_TEST(WKBLoaderTest, MixTypes) { }, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); rmm::cuda_stream cuda_stream; - - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; ParallelWkbLoader loader; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto geometries = loader.Finish(cuda_stream); const auto& offsets = geometries.get_offsets(); @@ -598,19 +625,22 @@ TYPED_TEST(WKBLoaderTest, GeomCollection) { 
"MULTIPOLYGON(((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 30, 15 5), (20 15, 35 15, 35 25, 20 25, 20 15)))"}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); rmm::cuda_stream cuda_stream; - - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; ParallelWkbLoader loader; typename ParallelWkbLoader::Config config; loader.Init(config); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto geometries = loader.Finish(cuda_stream); const auto& offsets = geometries.get_offsets(); diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/main.cc b/c/sedona-libgpuspatial/libgpuspatial/test/main.cc index a8b3c21f3..f89c68fcf 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/main.cc +++ b/c/sedona-libgpuspatial/libgpuspatial/test/main.cc @@ -17,6 +17,8 @@ #include // Requires C++17 #include #include + +#include "gpuspatial_testing.hpp" #include "gtest/gtest.h" namespace TestUtils { diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/refiner_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/refiner_test.cu new file mode 100644 index 000000000..3a48e2e89 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/test/refiner_test.cu @@ -0,0 +1,738 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "array_stream.hpp" +#include "gpuspatial/index/rt_spatial_index.hpp" +#include "gpuspatial/loader/device_geometries.hpp" +#include "gpuspatial/refine/rt_spatial_refiner.hpp" +#include "test_common.hpp" + +#include "geoarrow_geos/geoarrow_geos.hpp" +#include "nanoarrow/nanoarrow.hpp" + +#include +#include +#include +#include // For std::iota + +#include "gpuspatial/index/rt_spatial_index.cuh" +#include "gpuspatial/refine/rt_spatial_refiner.cuh" + +namespace gpuspatial { +// Function to read a single Parquet file and extract a column. +static arrow::Status ReadParquetFromFile( + arrow::fs::FileSystem* fs, // 1. Filesystem pointer (e.g., LocalFileSystem) + const std::string& file_path, // 2. Single file path instead of a folder + int64_t batch_size, const char* column_name, + std::vector>& out_arrays) { + // 1. Get FileInfo for the single path + ARROW_ASSIGN_OR_RAISE(auto file_info, fs->GetFileInfo(file_path)); + + // Check if the path points to a file + if (file_info.type() != arrow::fs::FileType::File) { + return arrow::Status::Invalid("Path is not a file: ", file_path); + } + + std::cout << "--- Processing Parquet file: " << file_path << " ---" << std::endl; + + // 2. Open the input file + ARROW_ASSIGN_OR_RAISE(auto input_file, fs->OpenInputFile(file_info)); + + // 3. Open the Parquet file and create an Arrow reader + ARROW_ASSIGN_OR_RAISE(auto arrow_reader, parquet::arrow::OpenFile( + input_file, arrow::default_memory_pool())); + + // 4. Set the batch size + arrow_reader->set_batch_size(batch_size); + + // 5. 
Get the RecordBatchReader + auto rb_reader = arrow_reader->GetRecordBatchReader().ValueOrDie(); + // 6. Read all record batches and extract the column + while (true) { + std::shared_ptr batch; + + // Read the next batch + ARROW_THROW_NOT_OK(rb_reader->ReadNext(&batch)); + + // Check for end of stream + if (!batch) { + break; + } + + // Extract the specified column and add to the output vector + std::shared_ptr column_array = batch->GetColumnByName(column_name); + if (!column_array) { + return arrow::Status::Invalid("Column not found: ", column_name); + } + out_arrays.push_back(column_array); + } + + std::cout << "Finished reading. Total arrays extracted: " << out_arrays.size() + << std::endl; + return arrow::Status::OK(); +} + +// Helper to concatenate C-style ArrowArrays +arrow::Result> ConcatCArrays( + const std::vector& c_arrays, ArrowSchema* c_schema) { + // 1. Import the schema ONCE into a C++ DataType object. + // This effectively "consumes" c_schema. + ARROW_ASSIGN_OR_RAISE(auto type, arrow::ImportType(c_schema)); + + arrow::ArrayVector arrays_to_concat; + arrays_to_concat.reserve(c_arrays.size()); + + // 2. Loop through arrays using the C++ type object. + for (ArrowArray* c_arr : c_arrays) { + // Use the ImportArray overload that takes std::shared_ptr. + // This validates c_arr against 'type' without consuming 'type'. 
+ ARROW_ASSIGN_OR_RAISE(auto arr, arrow::ImportArray(c_arr, type)); + arrays_to_concat.push_back(arr); + } + + return arrow::Concatenate(arrays_to_concat); +} + +using GeosBinaryPredicateFn = char (*)(GEOSContextHandle_t, const GEOSGeometry*, + const GEOSGeometry*); + +static GeosBinaryPredicateFn GetGeosPredicateFn(Predicate predicate) { + switch (predicate) { + case Predicate::kContains: + return &GEOSContains_r; + case Predicate::kIntersects: + return &GEOSIntersects_r; + case Predicate::kWithin: + return &GEOSWithin_r; + case Predicate::kEquals: + return &GEOSEquals_r; + case Predicate::kTouches: + return &GEOSTouches_r; + default: + throw std::out_of_range("Unsupported GEOS predicate enumeration value."); + } +} + +std::vector> ReadParquet(const std::string& path, + int batch_size = 100) { + using namespace TestUtils; + + auto fs = std::make_shared(); + + std::vector> build_arrays; + ARROW_THROW_NOT_OK( + ReadParquetFromFile(fs.get(), path, batch_size, "geometry", build_arrays)); + return build_arrays; +} + +void ReadArrowIPC(const std::string& path, std::vector& arrays, + std::vector& schemas, + uint32_t limit = std::numeric_limits::max()) { + nanoarrow::UniqueArrayStream stream; + ArrowError error; + + // Assuming this helper exists in your context or you implement it via Arrow C++ + // (It populates the C-stream from the file) + ArrayStreamFromIpc(path, "geometry", stream.get()); + uint32_t count = 0; + while (true) { + // 1. Create fresh objects for this iteration + nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; + + // 2. Get the next batch + // Note: This function expects 'array' to be empty/released. + int code = ArrowArrayStreamGetNext(stream.get(), array.get(), &error); + if (code != NANOARROW_OK) { + // Handle error (log or throw) + break; + } + + // 3. CHECK END OF STREAM + // If release is NULL, the stream is finished. + if (array->release == nullptr) { + break; + } + + // 4. 
Get the schema for this specific batch + // ArrowArrayStreamGetSchema creates a deep copy of the schema into 'schema'. + code = ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error); + if (code != NANOARROW_OK) { + // Handle error + break; + } + + // 5. Move ownership to the output vectors + arrays.push_back(std::move(array)); + schemas.push_back(std::move(schema)); + count += array->length; + if (count >= limit) break; + } +} + +void TestJoiner(ArrowSchema* build_schema, std::vector& build_arrays, + ArrowSchema* probe_schema, std::vector& probe_arrays, + Predicate predicate) { + using namespace TestUtils; + using coord_t = double; + using fpoint_t = Point; + using box_t = Box; + + auto rt_engine = std::make_shared(); + + { + std::string ptx_root = GetTestShaderPath(); + auto config = get_default_rt_config(ptx_root); + rt_engine->Init(config); + } + + RTSpatialIndexConfig idx_config; + idx_config.rt_engine = rt_engine; + auto rt_index = CreateRTSpatialIndex(idx_config); + RTSpatialRefinerConfig refiner_config; + refiner_config.rt_engine = rt_engine; + auto rt_refiner = CreateRTSpatialRefiner(refiner_config); + + geoarrow::geos::ArrayReader reader; + + class GEOSCppHandle { + public: + GEOSContextHandle_t handle; + + GEOSCppHandle() { handle = GEOS_init_r(); } + + ~GEOSCppHandle() { GEOS_finish_r(handle); } + }; + GEOSCppHandle handle; + + reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB); + + geoarrow::geos::GeometryVector geom_build(handle.handle); + size_t total_build_length = 0; + + for (auto& array : build_arrays) { + total_build_length += array->length; + } + + // Using GEOS for reference + geom_build.resize(total_build_length); + size_t tail_build = 0; + auto* tree = GEOSSTRtree_create_r(handle.handle, 10); + for (auto& array : build_arrays) { + // geos for reference + size_t n_build; + + ASSERT_EQ(reader.Read((ArrowArray*)array, 0, array->length, + geom_build.mutable_data() + tail_build, &n_build), + GEOARROW_GEOS_OK); + 
ASSERT_EQ(array->length, n_build); + std::vector rects; + + for (size_t offset = tail_build; offset < tail_build + n_build; offset++) { + auto* geom = geom_build.borrow(offset); + auto* box = GEOSEnvelope_r(handle.handle, geom); + + double xmin, ymin, xmax, ymax; + if (GEOSGeom_getExtent_r(handle.handle, box, &xmin, &ymin, &xmax, &ymax) == 0) { + printf("Error getting extent\n"); + xmin = 0; + ymin = 0; + xmax = -1; + ymax = -1; + } + + box_t bbox(fpoint_t((float)xmin, (float)ymin), fpoint_t((float)xmax, (float)ymax)); + + rects.push_back(bbox); + + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); + GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom); + GEOSGeom_destroy_r(handle.handle, box); + } + rt_index->PushBuild(rects.data(), rects.size()); + tail_build += n_build; + } + rt_index->FinishBuilding(); + ASSERT_EQ(GEOSSTRtree_build_r(handle.handle, tree), 1); + + auto build_array_ptr = ConcatCArrays(build_arrays, build_schema).ValueOrDie(); + + nanoarrow::UniqueArray uniq_build_array; + nanoarrow::UniqueSchema uniq_build_schema; + ARROW_THROW_NOT_OK(arrow::ExportArray(*build_array_ptr, uniq_build_array.get(), + uniq_build_schema.get())); + // Start stream processing + + for (auto& array : probe_arrays) { + geoarrow::geos::GeometryVector geom_stream(handle.handle); + size_t n_stream; + geom_stream.resize(array->length); + + ASSERT_EQ(reader.Read(array, 0, array->length, geom_stream.mutable_data(), &n_stream), + GEOARROW_GEOS_OK); + + std::vector queries; + + for (size_t i = 0; i < array->length; i++) { + auto* geom = geom_stream.borrow(i); + double xmin, ymin, xmax, ymax; + int result = GEOSGeom_getExtent_r(handle.handle, geom, &xmin, &ymin, &xmax, &ymax); + ASSERT_EQ(result, 1); + box_t bbox(fpoint_t((float)xmin, (float)ymin), fpoint_t((float)xmax, (float)ymax)); + queries.push_back(bbox); + } + + std::vector build_indices, stream_indices; + + rt_index->Probe(queries.data(), queries.size(), &build_indices, &stream_indices); + 
auto old_size = build_indices.size(); + + auto new_size = rt_refiner->Refine( + uniq_build_schema.get(), uniq_build_array.get(), probe_schema, array, predicate, + build_indices.data(), stream_indices.data(), build_indices.size()); + + build_indices.resize(new_size); + stream_indices.resize(new_size); + + struct Payload { + GEOSContextHandle_t handle; + const GEOSGeometry* geom; + std::vector build_indices; + std::vector stream_indices; + Predicate predicate; + }; + + Payload payload; + payload.predicate = predicate; + payload.handle = handle.handle; + + for (size_t offset = 0; offset < n_stream; offset++) { + auto* geom = geom_stream.borrow(offset); + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); + payload.geom = geom; + + GEOSSTRtree_query_r( + handle.handle, tree, geom, + [](void* item, void* data) { + auto* geom_build = (GEOSGeometry*)item; + auto* payload = (Payload*)data; + auto* geom_stream = payload->geom; + + if (GetGeosPredicateFn(payload->predicate)(payload->handle, geom_build, + geom_stream) == 1) { + auto build_id = (size_t)GEOSGeom_getUserData_r(payload->handle, geom_build); + auto stream_id = + (size_t)GEOSGeom_getUserData_r(payload->handle, geom_stream); + payload->build_indices.push_back(build_id); + payload->stream_indices.push_back(stream_id); + } + }, + (void*)&payload); + } + + ASSERT_EQ(payload.build_indices.size(), build_indices.size()); + ASSERT_EQ(payload.stream_indices.size(), stream_indices.size()); + sort_vectors_by_index(payload.build_indices, payload.stream_indices); + sort_vectors_by_index(build_indices, stream_indices); + for (size_t j = 0; j < build_indices.size(); j++) { + ASSERT_EQ(payload.build_indices[j], build_indices[j]); + ASSERT_EQ(payload.stream_indices[j], stream_indices[j]); + } + } + GEOSSTRtree_destroy_r(handle.handle, tree); +} + +void TestJoinerLoaded(ArrowSchema* build_schema, std::vector& build_arrays, + ArrowSchema* probe_schema, std::vector& probe_arrays, + Predicate predicate, bool 
pipelined = false) { + using namespace TestUtils; + using coord_t = double; + using fpoint_t = Point; + using box_t = Box; + + auto rt_engine = std::make_shared(); + { + std::string ptx_root = TestUtils::GetTestShaderPath(); + auto config = get_default_rt_config(ptx_root); + rt_engine->Init(config); + } + + RTSpatialIndexConfig idx_config; + idx_config.rt_engine = rt_engine; + auto rt_index = CreateRTSpatialIndex(idx_config); + + RTSpatialRefinerConfig refiner_config; + refiner_config.rt_engine = rt_engine; + if (pipelined) { + refiner_config.pipeline_batches = 10; + } + auto rt_refiner = CreateRTSpatialRefiner(refiner_config); + geoarrow::geos::ArrayReader reader; + + class GEOSCppHandle { + public: + GEOSContextHandle_t handle; + + GEOSCppHandle() { handle = GEOS_init_r(); } + + ~GEOSCppHandle() { GEOS_finish_r(handle); } + }; + GEOSCppHandle handle; + + reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB); + + geoarrow::geos::GeometryVector geom_build(handle.handle); + size_t total_build_length = 0; + + for (auto& array : build_arrays) { + total_build_length += array->length; + } + + // Using GEOS for reference + geom_build.resize(total_build_length); + size_t tail_build = 0; + auto* tree = GEOSSTRtree_create_r(handle.handle, 10); + for (auto& array : build_arrays) { + // geos for reference + size_t n_build; + + ASSERT_EQ(reader.Read((ArrowArray*)array, 0, array->length, + geom_build.mutable_data() + tail_build, &n_build), + GEOARROW_GEOS_OK); + ASSERT_EQ(array->length, n_build); + std::vector rects; + + for (size_t offset = tail_build; offset < tail_build + n_build; offset++) { + auto* geom = geom_build.borrow(offset); + auto* box = GEOSEnvelope_r(handle.handle, geom); + + double xmin, ymin, xmax, ymax; + if (GEOSGeom_getExtent_r(handle.handle, box, &xmin, &ymin, &xmax, &ymax) == 0) { + xmin = 0; + ymin = 0; + xmax = -1; + ymax = -1; + } + + box_t bbox(fpoint_t((float)xmin, (float)ymin), fpoint_t((float)xmax, (float)ymax)); + + 
rects.push_back(bbox); + + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); + GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom); + GEOSGeom_destroy_r(handle.handle, box); + } + rt_index->PushBuild(rects.data(), rects.size()); + tail_build += n_build; + } + rt_index->FinishBuilding(); + ASSERT_EQ(GEOSSTRtree_build_r(handle.handle, tree), 1); + + auto build_array_ptr = ConcatCArrays(build_arrays, build_schema).ValueOrDie(); + + nanoarrow::UniqueArray uniq_build_array; + nanoarrow::UniqueSchema uniq_build_schema; + ARROW_THROW_NOT_OK(arrow::ExportArray(*build_array_ptr, uniq_build_array.get(), + uniq_build_schema.get())); + // Start stream processing + + rt_refiner->PushBuild(uniq_build_schema.get(), uniq_build_array.get()); + rt_refiner->FinishBuilding(); + + for (auto& array : probe_arrays) { + geoarrow::geos::GeometryVector geom_stream(handle.handle); + size_t n_stream; + geom_stream.resize(array->length); + + ASSERT_EQ(reader.Read(array, 0, array->length, geom_stream.mutable_data(), &n_stream), + GEOARROW_GEOS_OK); + + std::vector queries; + + for (size_t i = 0; i < array->length; i++) { + auto* geom = geom_stream.borrow(i); + double xmin, ymin, xmax, ymax; + int result = GEOSGeom_getExtent_r(handle.handle, geom, &xmin, &ymin, &xmax, &ymax); + ASSERT_EQ(result, 1); + box_t bbox(fpoint_t((float)xmin, (float)ymin), fpoint_t((float)xmax, (float)ymax)); + queries.push_back(bbox); + } + + std::vector build_indices, stream_indices; + + rt_index->Probe(queries.data(), queries.size(), &build_indices, &stream_indices); + auto old_size = build_indices.size(); + + auto new_size = + rt_refiner->Refine(probe_schema, array, predicate, build_indices.data(), + stream_indices.data(), build_indices.size()); + + printf("Old size %u, new size %u\n", (unsigned)old_size, (unsigned)new_size); + build_indices.resize(new_size); + stream_indices.resize(new_size); + + struct Payload { + GEOSContextHandle_t handle; + const GEOSGeometry* geom; + std::vector 
build_indices; + std::vector stream_indices; + Predicate predicate; + }; + + Payload payload; + payload.predicate = predicate; + payload.handle = handle.handle; + + for (size_t offset = 0; offset < n_stream; offset++) { + auto* geom = geom_stream.borrow(offset); + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); + payload.geom = geom; + + GEOSSTRtree_query_r( + handle.handle, tree, geom, + [](void* item, void* data) { + auto* geom_build = (GEOSGeometry*)item; + auto* payload = (Payload*)data; + auto* geom_stream = payload->geom; + + if (GetGeosPredicateFn(payload->predicate)(payload->handle, geom_build, + geom_stream) == 1) { + auto build_id = (size_t)GEOSGeom_getUserData_r(payload->handle, geom_build); + auto stream_id = + (size_t)GEOSGeom_getUserData_r(payload->handle, geom_stream); + payload->build_indices.push_back(build_id); + payload->stream_indices.push_back(stream_id); + } + }, + (void*)&payload); + } + + ASSERT_EQ(payload.build_indices.size(), build_indices.size()); + ASSERT_EQ(payload.stream_indices.size(), stream_indices.size()); + sort_vectors_by_index(payload.build_indices, payload.stream_indices); + sort_vectors_by_index(build_indices, stream_indices); + for (size_t j = 0; j < build_indices.size(); j++) { + ASSERT_EQ(payload.build_indices[j], build_indices[j]); + ASSERT_EQ(payload.stream_indices[j], stream_indices[j]); + } + } + GEOSSTRtree_destroy_r(handle.handle, tree); +} + +TEST(JoinerTest, PIPContainsParquet) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector polys{ + GetTestDataPath("cities/natural-earth_cities_geo.parquet"), + GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; + std::vector points{GetTestDataPath("cities/generated_points.parquet"), + GetTestDataPath("countries/generated_points.parquet")}; + + for (int i = 0; i < polys.size(); i++) { + auto poly_path = TestUtils::GetTestDataPath(polys[i]); + auto point_path = TestUtils::GetCanonicalPath(points[i]); + auto 
poly_arrays = ReadParquet(poly_path, 1000); + auto point_arrays = ReadParquet(point_path, 1000); + std::vector poly_uniq_arrays, point_uniq_arrays; + std::vector poly_uniq_schema, point_uniq_schema; + + for (auto& arr : poly_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly_uniq_arrays.emplace_back().get(), + poly_uniq_schema.emplace_back().get())); + } + for (auto& arr : point_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, point_uniq_arrays.emplace_back().get(), + point_uniq_schema.emplace_back().get())); + } + + std::vector poly_c_arrays, point_c_arrays; + for (auto& arr : poly_uniq_arrays) { + poly_c_arrays.push_back(arr.get()); + } + for (auto& arr : point_uniq_arrays) { + point_c_arrays.push_back(arr.get()); + } + TestJoinerLoaded(poly_uniq_schema[0].get(), poly_c_arrays, point_uniq_schema[0].get(), + point_c_arrays, Predicate::kContains); + } +} + +TEST(JoinerTest, PIPContainsParquetLoaded) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector polys{ + GetTestDataPath("cities/natural-earth_cities_geo.parquet"), + GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; + std::vector points{GetTestDataPath("cities/generated_points.parquet"), + GetTestDataPath("countries/generated_points.parquet")}; + + for (int i = 0; i < polys.size(); i++) { + auto poly_path = TestUtils::GetTestDataPath(polys[i]); + auto point_path = TestUtils::GetCanonicalPath(points[i]); + auto poly_arrays = ReadParquet(poly_path, 1000); + auto point_arrays = ReadParquet(point_path, 1000); + std::vector poly_uniq_arrays, point_uniq_arrays; + std::vector poly_uniq_schema, point_uniq_schema; + + for (auto& arr : poly_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly_uniq_arrays.emplace_back().get(), + poly_uniq_schema.emplace_back().get())); + } + for (auto& arr : point_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, point_uniq_arrays.emplace_back().get(), + point_uniq_schema.emplace_back().get())); + } + + std::vector 
poly_c_arrays, point_c_arrays; + for (auto& arr : poly_uniq_arrays) { + poly_c_arrays.push_back(arr.get()); + } + for (auto& arr : point_uniq_arrays) { + point_c_arrays.push_back(arr.get()); + } + TestJoinerLoaded(poly_uniq_schema[0].get(), poly_c_arrays, point_uniq_schema[0].get(), + point_c_arrays, Predicate::kContains); + } +} + +TEST(JoinerTest, PIPContainsParquetPipelined) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector polys{ + GetTestDataPath("cities/natural-earth_cities_geo.parquet"), + GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; + std::vector points{GetTestDataPath("cities/generated_points.parquet"), + GetTestDataPath("countries/generated_points.parquet")}; + + for (int i = 0; i < polys.size(); i++) { + auto poly_path = TestUtils::GetTestDataPath(polys[i]); + auto point_path = TestUtils::GetCanonicalPath(points[i]); + auto poly_arrays = ReadParquet(poly_path, 1000); + auto point_arrays = ReadParquet(point_path, 1000); + std::vector poly_uniq_arrays, point_uniq_arrays; + std::vector poly_uniq_schema, point_uniq_schema; + + for (auto& arr : poly_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly_uniq_arrays.emplace_back().get(), + poly_uniq_schema.emplace_back().get())); + } + for (auto& arr : point_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, point_uniq_arrays.emplace_back().get(), + point_uniq_schema.emplace_back().get())); + } + + std::vector poly_c_arrays, point_c_arrays; + for (auto& arr : poly_uniq_arrays) { + poly_c_arrays.push_back(arr.get()); + } + for (auto& arr : point_uniq_arrays) { + point_c_arrays.push_back(arr.get()); + } + TestJoinerLoaded(poly_uniq_schema[0].get(), poly_c_arrays, point_uniq_schema[0].get(), + point_c_arrays, Predicate::kContains, true); + } +} + +TEST(JoinerTest, PIPContainsArrowIPC) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector polys{GetTestDataPath("arrowipc/test_polygons.arrows")}; + std::vector 
points{GetTestDataPath("arrowipc/test_points.arrows")}; + + for (int i = 0; i < polys.size(); i++) { + auto poly_path = TestUtils::GetTestDataPath(polys[i]); + auto point_path = TestUtils::GetCanonicalPath(points[i]); + std::vector poly_uniq_arrays, point_uniq_arrays; + std::vector poly_uniq_schema, point_uniq_schema; + + ReadArrowIPC(poly_path, poly_uniq_arrays, poly_uniq_schema); + ReadArrowIPC(point_path, point_uniq_arrays, point_uniq_schema); + + std::vector poly_c_arrays, point_c_arrays; + for (auto& arr : poly_uniq_arrays) { + poly_c_arrays.push_back(arr.get()); + } + for (auto& arr : point_uniq_arrays) { + point_c_arrays.push_back(arr.get()); + } + + TestJoiner(poly_uniq_schema[0].get(), poly_c_arrays, point_uniq_schema[0].get(), + point_c_arrays, Predicate::kContains); + } +} + +TEST(JoinerTest, PIPWithinArrowIPC) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector polys{GetTestDataPath("arrowipc/test_polygons.arrows")}; + std::vector points{GetTestDataPath("arrowipc/test_points.arrows")}; + + for (int i = 0; i < polys.size(); i++) { + auto poly_path = TestUtils::GetTestDataPath(polys[i]); + auto point_path = TestUtils::GetCanonicalPath(points[i]); + std::vector poly_uniq_arrays, point_uniq_arrays; + std::vector poly_uniq_schema, point_uniq_schema; + + ReadArrowIPC(poly_path, poly_uniq_arrays, poly_uniq_schema); + ReadArrowIPC(point_path, point_uniq_arrays, point_uniq_schema); + + std::vector poly_c_arrays, point_c_arrays; + for (auto& arr : poly_uniq_arrays) { + poly_c_arrays.push_back(arr.get()); + } + for (auto& arr : point_uniq_arrays) { + point_c_arrays.push_back(arr.get()); + } + + TestJoiner(point_uniq_schema[0].get(), point_c_arrays, poly_uniq_schema[0].get(), + poly_c_arrays, Predicate::kWithin); + } +} + +TEST(JoinerTest, PolygonPolygonContains) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector polys1{GetTestDataPath("arrowipc/test_polygons1.arrows")}; + std::vector 
polys2{GetTestDataPath("arrowipc/test_polygons2.arrows")}; + + for (int i = 0; i < polys1.size(); i++) { + auto poly1_path = TestUtils::GetTestDataPath(polys1[i]); + auto poly2_path = TestUtils::GetCanonicalPath(polys2[i]); + std::vector poly1_uniq_arrays, poly2_uniq_arrays; + std::vector poly1_uniq_schema, poly2_uniq_schema; + + ReadArrowIPC(poly1_path, poly1_uniq_arrays, poly1_uniq_schema, 100); + ReadArrowIPC(poly2_path, poly2_uniq_arrays, poly2_uniq_schema, 100); + + std::vector poly1_c_arrays, poly2_c_arrays; + for (auto& arr : poly1_uniq_arrays) { + poly1_c_arrays.push_back(arr.get()); + } + for (auto& arr : poly2_uniq_arrays) { + poly2_c_arrays.push_back(arr.get()); + } + + TestJoiner(poly1_uniq_schema[0].get(), poly1_c_arrays, poly2_uniq_schema[0].get(), + poly2_c_arrays, Predicate::kIntersects); + } +} +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu index fabcd3f5c..2b0ffb9a9 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu @@ -15,9 +15,9 @@ // specific language governing permissions and limitations // under the License. 
#include "array_stream.hpp" -#include "gpuspatial/loader/parallel_wkb_loader.h" -#include "gpuspatial/relate/relate.cuh" -#include "gpuspatial/utils/pinned_vector.h" +#include "gpuspatial/loader/parallel_wkb_loader.hpp" +#include "gpuspatial/relate/relate.hpp" +#include "gpuspatial/utils/pinned_vector.hpp" #include "test_common.hpp" @@ -58,15 +58,18 @@ void ParseWKTPoint(const char* wkt, POINT_T& point) { nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = loader.Finish(cuda_stream); auto h_vec = TestUtils::ToVector(cuda_stream, device_geometries.get_points()); cuda_stream.synchronize(); @@ -79,15 +82,19 @@ void ParseWKTMultiPoint(Context& ctx, const char* wkt, nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, 
array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = loader.Finish(cuda_stream); ctx.prefix_sum1 = TestUtils::ToVector( @@ -108,15 +115,19 @@ void ParseWKTLineString(Context& ctx, const char* wkt, nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = loader.Finish(cuda_stream); ctx.prefix_sum1 = TestUtils::ToVector( cuda_stream, device_geometries.get_offsets().line_string_offsets.ps_num_points); @@ -136,15 +147,19 @@ void ParseWKTMultiLineString(Context& ctx, const char* wkt, nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = 
loader.Finish(cuda_stream); ctx.prefix_sum1 = TestUtils::ToVector( cuda_stream, @@ -169,15 +184,19 @@ void ParseWKTPolygon(Context& ctx, const char* wkt, nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = loader.Finish(cuda_stream); ctx.prefix_sum1 = TestUtils::ToVector( cuda_stream, device_geometries.get_offsets().polygon_offsets.ps_num_rings); @@ -200,15 +219,19 @@ void ParseWKTMultiPolygon(Context& ctx, const char* wkt, nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = loader.Finish(cuda_stream); ctx.prefix_sum1 = TestUtils::ToVector( cuda_stream, 
device_geometries.get_offsets().multi_polygon_offsets.ps_num_parts); diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/test_common.hpp b/c/sedona-libgpuspatial/libgpuspatial/test/test_common.hpp index ecd9fd460..0412cf6f9 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/test_common.hpp +++ b/c/sedona-libgpuspatial/libgpuspatial/test/test_common.hpp @@ -16,9 +16,9 @@ // under the License. #pragma once -#include "gpuspatial/geom/point.cuh" -#include "gpuspatial/utils/array_view.h" -#include "gpuspatial/utils/pinned_vector.h" +#include "gpuspatial/geom/point.hpp" +#include "gpuspatial/utils/array_view.hpp" +#include "gpuspatial/utils/pinned_vector.hpp" #include "gtest/gtest.h" #include "rmm/cuda_stream_view.hpp" diff --git a/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json b/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json index b162d78e2..f593623e8 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json +++ b/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json @@ -7,6 +7,7 @@ "dependencies": [ "gtest", "geos", + "zstd", { "name": "arrow", "features": [ diff --git a/c/sedona-libgpuspatial/src/error.rs b/c/sedona-libgpuspatial/src/error.rs index 3530e40e8..d38897019 100644 --- a/c/sedona-libgpuspatial/src/error.rs +++ b/c/sedona-libgpuspatial/src/error.rs @@ -24,7 +24,8 @@ pub enum GpuSpatialError { Init(String), PushBuild(String), FinishBuild(String), - PushStream(String), + Probe(String), + Refine(String), } impl From for GpuSpatialError { @@ -48,8 +49,11 @@ impl fmt::Display for GpuSpatialError { GpuSpatialError::FinishBuild(errmsg) => { write!(f, "Finish building failed: {}", errmsg) } - GpuSpatialError::PushStream(errmsg) => { - write!(f, "Push stream failed: {}", errmsg) + GpuSpatialError::Probe(errmsg) => { + write!(f, "Probe failed: {}", errmsg) + } + GpuSpatialError::Refine(errmsg) => { + write!(f, "Refine failed: {}", errmsg) } } } diff --git a/c/sedona-libgpuspatial/src/lib.rs b/c/sedona-libgpuspatial/src/lib.rs index 1bcd4ef43..5e646ebcb 100644 
--- a/c/sedona-libgpuspatial/src/lib.rs +++ b/c/sedona-libgpuspatial/src/lib.rs @@ -23,30 +23,44 @@ mod libgpuspatial; #[cfg(gpu_available)] mod libgpuspatial_glue_bindgen; -// Import Array trait for len() method (used in gpu_available code) #[cfg(gpu_available)] -use arrow_array::Array; - +use std::sync::{Arc, Mutex}; +// Import Array trait for len() method (used in gpu_available code) +use geo::Rect; // Re-exports for GPU functionality #[cfg(gpu_available)] pub use error::GpuSpatialError; #[cfg(gpu_available)] -pub use libgpuspatial::{GpuSpatialJoinerWrapper, GpuSpatialPredicateWrapper}; +pub use libgpuspatial::{ + GpuSpatialIndexFloat2DWrapper, GpuSpatialRefinerWrapper, GpuSpatialRelationPredicateWrapper, + GpuSpatialRuntimeWrapper, +}; #[cfg(gpu_available)] -pub use libgpuspatial_glue_bindgen::GpuSpatialJoinerContext; +pub use libgpuspatial_glue_bindgen::SedonaSpatialIndexContext; +#[cfg(gpu_available)] +use nvml_wrapper::Nvml; // Mark GPU types as Send for thread safety // SAFETY: The GPU library is designed to be used from multiple threads. // Each thread gets its own context, and the underlying GPU library handles thread safety. // The raw pointers inside are managed by the C++ library which ensures proper synchronization. 
#[cfg(gpu_available)] -unsafe impl Send for GpuSpatialJoinerContext {} +unsafe impl Send for SedonaSpatialIndexContext {} +#[cfg(gpu_available)] +unsafe impl Send for libgpuspatial_glue_bindgen::GpuSpatialRuntime {} +#[cfg(gpu_available)] +unsafe impl Sync for libgpuspatial_glue_bindgen::GpuSpatialRuntime {} #[cfg(gpu_available)] -unsafe impl Send for libgpuspatial_glue_bindgen::GpuSpatialJoiner {} +unsafe impl Send for libgpuspatial_glue_bindgen::SedonaFloatIndex2D {} +#[cfg(gpu_available)] +unsafe impl Send for libgpuspatial_glue_bindgen::SedonaSpatialRefiner {} + +#[cfg(gpu_available)] +unsafe impl Sync for libgpuspatial_glue_bindgen::SedonaFloatIndex2D {} #[cfg(gpu_available)] -unsafe impl Send for GpuSpatialJoinerWrapper {} +unsafe impl Sync for libgpuspatial_glue_bindgen::SedonaSpatialRefiner {} // Error type for non-GPU builds #[cfg(not(gpu_available))] @@ -58,16 +72,77 @@ pub enum GpuSpatialError { pub type Result = std::result::Result; +/// Spatial predicates for GPU operations +#[repr(u32)] +#[derive(Debug, PartialEq, Copy, Clone)] +pub enum GpuSpatialRelationPredicate { + Equals = 0, + Disjoint = 1, + Touches = 2, + Contains = 3, + Covers = 4, + Intersects = 5, + Within = 6, + CoveredBy = 7, +} + +impl std::fmt::Display for GpuSpatialRelationPredicate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GpuSpatialRelationPredicate::Equals => write!(f, "equals"), + GpuSpatialRelationPredicate::Disjoint => write!(f, "disjoint"), + GpuSpatialRelationPredicate::Touches => write!(f, "touches"), + GpuSpatialRelationPredicate::Contains => write!(f, "contains"), + GpuSpatialRelationPredicate::Covers => write!(f, "covers"), + GpuSpatialRelationPredicate::Intersects => write!(f, "intersects"), + GpuSpatialRelationPredicate::Within => write!(f, "within"), + GpuSpatialRelationPredicate::CoveredBy => write!(f, "coveredby"), + } + } +} + +#[cfg(gpu_available)] +impl From for GpuSpatialRelationPredicateWrapper { + fn from(pred: 
GpuSpatialRelationPredicate) -> Self { + match pred { + GpuSpatialRelationPredicate::Equals => GpuSpatialRelationPredicateWrapper::Equals, + GpuSpatialRelationPredicate::Disjoint => GpuSpatialRelationPredicateWrapper::Disjoint, + GpuSpatialRelationPredicate::Touches => GpuSpatialRelationPredicateWrapper::Touches, + GpuSpatialRelationPredicate::Contains => GpuSpatialRelationPredicateWrapper::Contains, + GpuSpatialRelationPredicate::Covers => GpuSpatialRelationPredicateWrapper::Covers, + GpuSpatialRelationPredicate::Intersects => { + GpuSpatialRelationPredicateWrapper::Intersects + } + GpuSpatialRelationPredicate::Within => GpuSpatialRelationPredicateWrapper::Within, + GpuSpatialRelationPredicate::CoveredBy => GpuSpatialRelationPredicateWrapper::CoveredBy, + } + } +} + +/// Global shared GpuSpatialRuntime. Building an instance is expensive, so we share it across all GpuSpatial instances. +#[cfg(gpu_available)] +static GLOBAL_GPUSPATIAL_RUNTIME: Mutex>>> = + Mutex::new(None); /// High-level wrapper for GPU spatial operations -pub struct GpuSpatialContext { +pub struct GpuSpatial { + #[cfg(gpu_available)] + runtime: Option>>, #[cfg(gpu_available)] - joiner: Option, + index: Option, #[cfg(gpu_available)] - context: Option, - initialized: bool, + refiner: Option, +} + +pub struct GpuSpatialOptions { + pub cuda_use_memory_pool: bool, + pub cuda_memory_pool_init_percent: i32, + pub concurrency: u32, + pub device_id: i32, + pub compress_bvh: bool, + pub pipeline_batches: u32, } -impl GpuSpatialContext { +impl GpuSpatial { pub fn new() -> Result { #[cfg(not(gpu_available))] { @@ -77,197 +152,480 @@ impl GpuSpatialContext { #[cfg(gpu_available)] { Ok(Self { - joiner: None, - context: None, - initialized: false, + runtime: None, + index: None, + refiner: None, }) } } - pub fn init(&mut self) -> Result<()> { + pub fn init(&mut self, options: GpuSpatialOptions) -> Result<()> { #[cfg(not(gpu_available))] { + let _ = options; Err(GpuSpatialError::GpuNotAvailable) } 
#[cfg(gpu_available)] { - let mut joiner = GpuSpatialJoinerWrapper::new(); - // Get PTX path from OUT_DIR - let out_path = std::path::PathBuf::from(env!("OUT_DIR")); - let ptx_root = out_path.join("share/gpuspatial/shaders"); - let ptx_root_str = ptx_root - .to_str() - .ok_or_else(|| GpuSpatialError::Init("Invalid PTX path".to_string()))?; - - // Initialize with concurrency of 1 for now - joiner.init(1, ptx_root_str)?; - - // Create context - let mut ctx = GpuSpatialJoinerContext { - last_error: std::ptr::null(), - private_data: std::ptr::null_mut(), - build_indices: std::ptr::null_mut(), - stream_indices: std::ptr::null_mut(), - }; - joiner.create_context(&mut ctx); + // Acquire the lock for the global shared runtime + let mut global_runtime_guard = GLOBAL_GPUSPATIAL_RUNTIME.lock().unwrap(); + + // Initialize the global runtime if it hasn't been initialized yet + if global_runtime_guard.is_none() { + // Get PTX path from OUT_DIR + let out_path = std::path::PathBuf::from(env!("OUT_DIR")); + let ptx_root = out_path.join("share/gpuspatial/shaders"); + let ptx_root_str = ptx_root + .to_str() + .ok_or_else(|| GpuSpatialError::Init("Invalid PTX path".to_string()))?; + + let runtime = GpuSpatialRuntimeWrapper::try_new( + options.device_id, + ptx_root_str, + options.cuda_use_memory_pool, + options.cuda_memory_pool_init_percent, + )?; + *global_runtime_guard = Some(Arc::new(Mutex::new(runtime))); + } + + // Get a clone of the Arc to the shared runtime + // safe to unwrap here because we just ensured it is Some + let runtime_ref = global_runtime_guard.as_ref().unwrap().clone(); + // Assign to self + self.runtime = Some(runtime_ref); + + let index = GpuSpatialIndexFloat2DWrapper::try_new( + self.runtime.as_ref().unwrap(), + options.concurrency, + )?; + + self.index = Some(index); + + let refiner = GpuSpatialRefinerWrapper::try_new( + self.runtime.as_ref().unwrap(), + options.concurrency, + options.compress_bvh, + options.pipeline_batches, + )?; + self.refiner = 
Some(refiner); - self.joiner = Some(joiner); - self.context = Some(ctx); - self.initialized = true; Ok(()) } } - #[cfg(gpu_available)] - pub fn get_joiner_mut(&mut self) -> Option<&mut GpuSpatialJoinerWrapper> { - self.joiner.as_mut() + pub fn is_gpu_available() -> bool { + #[cfg(not(gpu_available))] + { + false + } + #[cfg(gpu_available)] + { + let nvml = match Nvml::init() { + Ok(instance) => instance, + Err(_) => return false, + }; + + // Check if the device count is greater than zero + match nvml.device_count() { + Ok(count) => count > 0, + Err(_) => false, + } + } } - #[cfg(gpu_available)] - pub fn get_context_mut(&mut self) -> Option<&mut GpuSpatialJoinerContext> { - self.context.as_mut() + /// Clear previous build data + pub fn index_clear(&mut self) -> Result<()> { + #[cfg(not(gpu_available))] + { + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let index = self + .index + .as_mut() + .ok_or_else(|| GpuSpatialError::Init("GPU index is not available".into()))?; + + // Clear previous build data + index.clear(); + Ok(()) + } } - pub fn is_initialized(&self) -> bool { - self.initialized + pub fn index_push_build(&mut self, rects: &[Rect]) -> Result<()> { + #[cfg(not(gpu_available))] + { + let _ = rects; + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let index = self + .index + .as_mut() + .ok_or_else(|| GpuSpatialError::Init("GPU index not available".into()))?; + + unsafe { index.push_build(rects.as_ptr() as *const f32, rects.len() as u32) } + } + } + + pub fn index_finish_building(&mut self) -> Result<()> { + #[cfg(not(gpu_available))] + return Err(GpuSpatialError::GpuNotAvailable); + + #[cfg(gpu_available)] + self.index + .as_mut() + .ok_or_else(|| GpuSpatialError::Init("GPU index not available".into()))? 
+ .finish_building() } - /// Perform spatial join between two geometry arrays - pub fn spatial_join( - &mut self, - left_geom: arrow_array::ArrayRef, - right_geom: arrow_array::ArrayRef, - predicate: SpatialPredicate, - ) -> Result<(Vec, Vec)> { + pub fn probe(&self, rects: &[Rect]) -> Result<(Vec, Vec)> { #[cfg(not(gpu_available))] { - let _ = (left_geom, right_geom, predicate); + let _ = rects; Err(GpuSpatialError::GpuNotAvailable) } #[cfg(gpu_available)] { - if !self.initialized { - return Err(GpuSpatialError::Init("Context not initialized".into())); - } + let index = self + .index + .as_ref() + .ok_or_else(|| GpuSpatialError::Init("GPU index not available".into()))?; - let joiner = self - .joiner - .as_mut() - .ok_or_else(|| GpuSpatialError::Init("GPU joiner not available".into()))?; + let mut ctx = SedonaSpatialIndexContext { + private_data: std::ptr::null_mut(), + }; + index.create_context(&mut ctx); - // Clear previous build data - joiner.clear(); - - // Push build data (left side) - log::info!( - "DEBUG: Pushing {} geometries to GPU (build side)", - left_geom.len() - ); - log::info!("DEBUG: Left array data type: {:?}", left_geom.data_type()); - if let Some(binary_arr) = left_geom - .as_any() - .downcast_ref::() - { - log::info!("DEBUG: Left binary array has {} values", binary_arr.len()); - if binary_arr.len() > 0 { - let first_wkb = binary_arr.value(0); - log::info!( - "DEBUG: First left WKB length: {}, first bytes: {:?}", - first_wkb.len(), - &first_wkb[..8.min(first_wkb.len())] - ); + let result = (|| -> Result<(Vec, Vec)> { + unsafe { + // If this fails, it returns Err from the *closure*, not the function + index.probe(&mut ctx, rects.as_ptr() as *const f32, rects.len() as u32)?; } - } - joiner.push_build(&left_geom, 0, left_geom.len() as i64)?; - joiner.finish_building()?; + // Copy results + let build_indices = index.get_build_indices_buffer(&mut ctx).to_vec(); + let probe_indices = index.get_probe_indices_buffer(&mut ctx).to_vec(); - // Recreate 
context after building (required by libgpuspatial) - let mut new_context = libgpuspatial_glue_bindgen::GpuSpatialJoinerContext { - last_error: std::ptr::null(), - private_data: std::ptr::null_mut(), - build_indices: std::ptr::null_mut(), - stream_indices: std::ptr::null_mut(), - }; - joiner.create_context(&mut new_context); - self.context = Some(new_context); - let context = self.context.as_mut().unwrap(); - // Push stream data (right side) and perform join - let gpu_predicate = predicate.into(); - joiner.push_stream( - context, - &right_geom, - 0, - right_geom.len() as i64, - gpu_predicate, - 0, // array_index_offset - )?; + Ok((build_indices, probe_indices)) + })(); - // Get results - let build_indices = joiner.get_build_indices_buffer(context).to_vec(); - let stream_indices = joiner.get_stream_indices_buffer(context).to_vec(); + index.destroy_context(&mut ctx); - Ok((build_indices, stream_indices)) + result } } -} -/// Spatial predicates for GPU operations -#[repr(u32)] -#[derive(Debug, PartialEq, Copy, Clone)] -pub enum SpatialPredicate { - Equals = 0, - Disjoint = 1, - Touches = 2, - Contains = 3, - Covers = 4, - Intersects = 5, - Within = 6, - CoveredBy = 7, -} + pub fn refiner_clear(&mut self) -> Result<()> { + #[cfg(not(gpu_available))] + { + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let refiner = self + .refiner + .as_mut() + .ok_or_else(|| GpuSpatialError::Init("GPU refiner is not available".into()))?; -#[cfg(gpu_available)] -impl From for GpuSpatialPredicateWrapper { - fn from(pred: SpatialPredicate) -> Self { - match pred { - SpatialPredicate::Equals => GpuSpatialPredicateWrapper::Equals, - SpatialPredicate::Disjoint => GpuSpatialPredicateWrapper::Disjoint, - SpatialPredicate::Touches => GpuSpatialPredicateWrapper::Touches, - SpatialPredicate::Contains => GpuSpatialPredicateWrapper::Contains, - SpatialPredicate::Covers => GpuSpatialPredicateWrapper::Covers, - SpatialPredicate::Intersects => 
GpuSpatialPredicateWrapper::Intersects, - SpatialPredicate::Within => GpuSpatialPredicateWrapper::Within, - SpatialPredicate::CoveredBy => GpuSpatialPredicateWrapper::CoveredBy, + // Clear previous build data + refiner.clear(); + Ok(()) } } -} -// Cleanup implementation -impl Drop for GpuSpatialContext { - fn drop(&mut self) { + pub fn refiner_push_build(&mut self, array: &arrow_array::ArrayRef) -> Result<()> { + #[cfg(not(gpu_available))] + { + let _ = array; + Err(GpuSpatialError::GpuNotAvailable) + } #[cfg(gpu_available)] { - if let (Some(mut joiner), Some(mut ctx)) = (self.joiner.take(), self.context.take()) { - joiner.destroy_context(&mut ctx); - joiner.release(); - } + let refiner = self + .refiner + .as_ref() + .ok_or_else(|| GpuSpatialError::Init("GPU refiner not available".into()))?; + + refiner.push_build(array) + } + } + + pub fn refiner_finish_building(&mut self) -> Result<()> { + #[cfg(not(gpu_available))] + { + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let refiner = self + .refiner + .as_mut() + .ok_or_else(|| GpuSpatialError::Init("GPU refiner not available".into()))?; + + refiner.finish_building() + } + } + + pub fn refine_loaded( + &self, + probe_array: &arrow_array::ArrayRef, + predicate: GpuSpatialRelationPredicate, + build_indices: &mut Vec, + probe_indices: &mut Vec, + ) -> Result<()> { + #[cfg(not(gpu_available))] + { + let _ = (probe_array, predicate, build_indices, probe_indices); + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let refiner = self + .refiner + .as_ref() + .ok_or_else(|| GpuSpatialError::Init("GPU refiner not available".into()))?; + + refiner.refine_loaded( + probe_array, + GpuSpatialRelationPredicateWrapper::from(predicate), + build_indices, + probe_indices, + ) + } + } + + pub fn refine( + &self, + array1: &arrow_array::ArrayRef, + array2: &arrow_array::ArrayRef, + predicate: GpuSpatialRelationPredicate, + indices1: &mut Vec, + indices2: &mut Vec, + ) -> Result<()> { + 
#[cfg(not(gpu_available))] + { + let _ = (array1, array2, predicate, indices1, indices2); + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let refiner = self + .refiner + .as_ref() + .ok_or_else(|| GpuSpatialError::Init("GPU refiner not available".into()))?; + + refiner.refine( + array1, + array2, + GpuSpatialRelationPredicateWrapper::from(predicate), + indices1, + indices2, + ) } } } +#[cfg(gpu_available)] #[cfg(test)] mod tests { use super::*; + use geo::{BoundingRect, Intersects, Point, Polygon}; + use sedona_expr::scalar_udf::SedonaScalarUDF; + use sedona_geos::register::scalar_kernels; + use sedona_schema::crs::lnglat; + use sedona_schema::datatypes::{Edges, SedonaType, WKB_GEOMETRY}; + use sedona_testing::create::create_array_storage; + use sedona_testing::testers::ScalarUdfTester; + use wkt::TryFromWkt; + + pub fn find_intersection_pairs( + vec_a: &[Rect], + vec_b: &[Rect], + ) -> (Vec, Vec) { + let mut ids_a = Vec::new(); + let mut ids_b = Vec::new(); + + // Iterate through A with index 'i' + for (i, rect_a) in vec_a.iter().enumerate() { + // Only proceed if 'a' exists + // Iterate through B with index 'j' + for (j, rect_b) in vec_b.iter().enumerate() { + // Check if 'b' exists and intersects 'a' + if rect_a.intersects(rect_b) { + ids_a.push(i as u32); + ids_b.push(j as u32); + } + } + } + (ids_a, ids_b) + } #[test] - fn test_context_creation() { - let ctx = GpuSpatialContext::new(); - #[cfg(gpu_available)] - assert!(ctx.is_ok()); - #[cfg(not(gpu_available))] - assert!(ctx.is_err()); + fn test_spatial_index() { + let mut gs = GpuSpatial::new().unwrap(); + let options = GpuSpatialOptions { + concurrency: 1, + device_id: 0, + compress_bvh: false, + pipeline_batches: 1, + cuda_use_memory_pool: true, + cuda_memory_pool_init_percent: 10, + }; + gs.init(options).expect("Failed to initialize GpuSpatial"); + + let polygon_values = &[ + Some("POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"), + Some("POLYGON ((35 10, 45 45, 15 40, 10 20, 35 
10), (20 30, 35 35, 30 20, 20 30))"), + Some("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (2 2, 3 2, 3 3, 2 3, 2 2), (6 6, 8 6, 8 8, 6 8, 6 6))"), + Some("POLYGON ((30 0, 60 20, 50 50, 10 50, 0 20, 30 0), (20 30, 25 40, 15 40, 20 30), (30 30, 35 40, 25 40, 30 30), (40 30, 45 40, 35 40, 40 30))"), + Some("POLYGON ((40 0, 50 30, 80 20, 90 70, 60 90, 30 80, 20 40, 40 0), (50 20, 65 30, 60 50, 45 40, 50 20), (30 60, 50 70, 45 80, 30 60))"), + ]; + let rects: Vec> = polygon_values + .iter() + .filter_map(|opt_wkt| { + let wkt_str = opt_wkt.as_ref()?; + let polygon: Polygon = Polygon::try_from_wkt_str(wkt_str).ok()?; + + polygon.bounding_rect() + }) + .collect(); + gs.index_push_build(&rects) + .expect("Failed to push build data"); + gs.index_finish_building() + .expect("Failed to finish building"); + let point_values = &[ + Some("POINT (30 20)"), + Some("POINT (20 20)"), + Some("POINT (1 1)"), + Some("POINT (70 70)"), + Some("POINT (55 35)"), + ]; + let points: Vec> = point_values + .iter() + .map(|opt_wkt| -> Rect { + let wkt_str = opt_wkt.unwrap(); + let point: Point = Point::try_from_wkt_str(wkt_str).ok().unwrap(); + point.bounding_rect() + }) + .collect(); + let (mut build_indices, mut probe_indices) = gs.probe(&points).unwrap(); + build_indices.sort(); + probe_indices.sort(); + + let (mut ans_build_indices, mut ans_probe_indices) = + find_intersection_pairs(&rects, &points); + + ans_build_indices.sort(); + ans_probe_indices.sort(); + + assert_eq!(build_indices, ans_build_indices); + assert_eq!(probe_indices, ans_probe_indices); + } + + #[test] + fn test_spatial_refiner() { + let mut gs = GpuSpatial::new().unwrap(); + let options = GpuSpatialOptions { + concurrency: 1, + device_id: 0, + compress_bvh: false, + pipeline_batches: 1, + cuda_use_memory_pool: true, + cuda_memory_pool_init_percent: 10, + }; + gs.init(options).expect("Failed to initialize GpuSpatial"); + + let polygon_values = &[ + Some("POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"), + Some("POLYGON ((35 10, 
45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))"), + Some("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (2 2, 3 2, 3 3, 2 3, 2 2), (6 6, 8 6, 8 8, 6 8, 6 6))"), + Some("POLYGON ((30 0, 60 20, 50 50, 10 50, 0 20, 30 0), (20 30, 25 40, 15 40, 20 30), (30 30, 35 40, 25 40, 30 30), (40 30, 45 40, 35 40, 40 30))"), + Some("POLYGON ((40 0, 50 30, 80 20, 90 70, 60 90, 30 80, 20 40, 40 0), (50 20, 65 30, 60 50, 45 40, 50 20), (30 60, 50 70, 45 80, 30 60))"), + ]; + let polygons = create_array_storage(polygon_values, &WKB_GEOMETRY); + + let rects: Vec> = polygon_values + .iter() + .map(|opt_wkt| -> Rect { + let wkt_str = opt_wkt.unwrap(); + let polygon: Polygon = Polygon::try_from_wkt_str(wkt_str).ok().unwrap(); + polygon.bounding_rect().unwrap() + }) + .collect(); + gs.index_push_build(&rects) + .expect("Failed to push build data"); + gs.index_finish_building() + .expect("Failed to finish building"); + let point_values = &[ + Some("POINT (30 20)"), + Some("POINT (20 20)"), + Some("POINT (1 1)"), + Some("POINT (70 70)"), + Some("POINT (55 35)"), + ]; + let points = create_array_storage(point_values, &WKB_GEOMETRY); + let point_rects: Vec> = point_values + .iter() + .map(|wkt| -> Rect { + let wkt_str = wkt.unwrap(); + + let point: Point = Point::try_from_wkt_str(wkt_str).unwrap(); + + point.bounding_rect() + }) + .collect(); + let (mut build_indices, mut probe_indices) = gs.probe(&point_rects).unwrap(); + + gs.refine( + &polygons, + &points, + GpuSpatialRelationPredicate::Intersects, + &mut build_indices, + &mut probe_indices, + ) + .expect("Failed to refine results"); + + build_indices.sort(); + probe_indices.sort(); + + let kernels = scalar_kernels(); + + // Iterate through the vector and find the one named "st_intersects" + let st_intersects = kernels + .into_iter() + .find(|(name, _)| *name == "st_intersects") + .map(|(_, kernel_ref)| kernel_ref) + .unwrap(); + + let sedona_type = SedonaType::Wkb(Edges::Planar, lnglat()); + let udf = 
SedonaScalarUDF::from_impl("st_intersects", st_intersects); + let tester = + ScalarUdfTester::new(udf.into(), vec![sedona_type.clone(), sedona_type.clone()]); + + let mut ans_build_indices: Vec = Vec::new(); + let mut ans_probe_indices: Vec = Vec::new(); + + for (poly_index, poly) in polygon_values.iter().enumerate() { + for (point_index, point) in point_values.iter().enumerate() { + let result = tester + .invoke_scalar_scalar(poly.unwrap(), point.unwrap()) + .unwrap(); + if result == true.into() { + ans_build_indices.push(poly_index as u32); + ans_probe_indices.push(point_index as u32); + } + } + } + + ans_build_indices.sort(); + ans_probe_indices.sort(); + + assert_eq!(build_indices, ans_build_indices); + assert_eq!(probe_indices, ans_probe_indices); } } diff --git a/c/sedona-libgpuspatial/src/libgpuspatial.rs b/c/sedona-libgpuspatial/src/libgpuspatial.rs index 414b92e09..5723ac35c 100644 --- a/c/sedona-libgpuspatial/src/libgpuspatial.rs +++ b/c/sedona-libgpuspatial/src/libgpuspatial.rs @@ -17,106 +17,143 @@ use crate::error::GpuSpatialError; use crate::libgpuspatial_glue_bindgen::*; -use arrow_array::{ffi::FFI_ArrowArray, ArrayRef}; +use arrow_array::{ffi::FFI_ArrowArray, Array, ArrayRef}; +use arrow_schema::ffi::FFI_ArrowSchema; use std::convert::TryFrom; use std::ffi::CString; use std::mem::transmute; -use std::os::raw::{c_uint, c_void}; +use std::os::raw::c_uint; +use std::sync::{Arc, Mutex}; -pub struct GpuSpatialJoinerWrapper { - joiner: GpuSpatialJoiner, +pub struct GpuSpatialRuntimeWrapper { + runtime: GpuSpatialRuntime, } -#[repr(u32)] -#[derive(Debug, PartialEq, Copy, Clone)] -pub enum GpuSpatialPredicateWrapper { - Equals = 0, - Disjoint = 1, - Touches = 2, - Contains = 3, - Covers = 4, - Intersects = 5, - Within = 6, - CoveredBy = 7, -} +impl GpuSpatialRuntimeWrapper { + /// # Initializes the GpuSpatialRuntime + /// This function should only be called once per engine instance. + /// # Arguments + /// * `device_id` - The GPU device ID to use. 
+ /// * `ptx_root` - The root directory for PTX files. + pub fn try_new( + device_id: i32, + ptx_root: &str, + use_cuda_memory_pool: bool, + cuda_memory_pool_init_precent: i32, + ) -> Result { + let mut runtime = GpuSpatialRuntime { + init: None, + release: None, + get_last_error: None, + private_data: std::ptr::null_mut(), + }; -impl TryFrom for GpuSpatialPredicateWrapper { - type Error = &'static str; + unsafe { + // Set function pointers to the C functions + GpuSpatialRuntimeCreate(&mut runtime); + } - fn try_from(v: c_uint) -> Result { - match v { - 0 => Ok(GpuSpatialPredicateWrapper::Equals), - 1 => Ok(GpuSpatialPredicateWrapper::Disjoint), - 2 => Ok(GpuSpatialPredicateWrapper::Touches), - 3 => Ok(GpuSpatialPredicateWrapper::Contains), - 4 => Ok(GpuSpatialPredicateWrapper::Covers), - 5 => Ok(GpuSpatialPredicateWrapper::Intersects), - 6 => Ok(GpuSpatialPredicateWrapper::Within), - 7 => Ok(GpuSpatialPredicateWrapper::CoveredBy), - _ => Err("Invalid GpuSpatialPredicate value"), + if let Some(init_fn) = runtime.init { + let c_ptx_root = CString::new(ptx_root).expect("CString::new failed"); + + let mut config = GpuSpatialRuntimeConfig { + device_id, + ptx_root: c_ptx_root.as_ptr(), + use_cuda_memory_pool, + cuda_memory_pool_init_precent, + }; + + // This is an unsafe call because it's calling a C function from the bindings. 
+ unsafe { + if init_fn(&runtime as *const _ as *mut _, &mut config) != 0 { + let error_message = + runtime.get_last_error.unwrap()(&runtime as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + return Err(GpuSpatialError::Init(error_string)); + } + } } - } -} -impl Default for GpuSpatialJoinerWrapper { - fn default() -> Self { - Self::new() + Ok(GpuSpatialRuntimeWrapper { runtime }) } } -impl GpuSpatialJoinerWrapper { - pub fn new() -> Self { - GpuSpatialJoinerWrapper { - joiner: GpuSpatialJoiner { +impl Default for GpuSpatialRuntimeWrapper { + fn default() -> Self { + GpuSpatialRuntimeWrapper { + runtime: GpuSpatialRuntime { init: None, - clear: None, - create_context: None, - destroy_context: None, - push_build: None, - finish_building: None, - push_stream: None, - get_build_indices_buffer: None, - get_stream_indices_buffer: None, release: None, + get_last_error: None, private_data: std::ptr::null_mut(), - last_error: std::ptr::null(), }, } } +} +impl Drop for GpuSpatialRuntimeWrapper { + fn drop(&mut self) { + // Call the release function if it exists + if let Some(release_fn) = self.runtime.release { + unsafe { + release_fn(&mut self.runtime as *mut _); + } + } + } +} + +pub struct GpuSpatialIndexFloat2DWrapper { + index: SedonaFloatIndex2D, + _runtime: Arc>, // Keep a reference to the RT engine to ensure it lives as long as the index +} + +impl GpuSpatialIndexFloat2DWrapper { /// # Initializes the GpuSpatialJoiner /// This function should only be called once per joiner instance. /// /// # Arguments + /// * `runtime` - The GPUSpatial runtime to use for GPU operations. /// * `concurrency` - How many threads will call the joiner concurrently. - /// * `ptx_root` - The root directory for PTX files. 
- pub fn init(&mut self, concurrency: u32, ptx_root: &str) -> Result<(), GpuSpatialError> { - let joiner_ptr: *mut GpuSpatialJoiner = &mut self.joiner; + pub fn try_new( + runtime: &Arc>, + concurrency: u32, + ) -> Result { + let mut index = SedonaFloatIndex2D { + clear: None, + create_context: None, + destroy_context: None, + push_build: None, + finish_building: None, + probe: None, + get_build_indices_buffer: None, + get_probe_indices_buffer: None, + get_last_error: None, + context_get_last_error: None, + release: None, + private_data: std::ptr::null_mut(), + }; + let mut engine_guard = runtime + .lock() + .map_err(|_| GpuSpatialError::Init("Failed to acquire mutex lock".to_string()))?; + let config = GpuSpatialIndexConfig { + runtime: &mut engine_guard.runtime, + concurrency, + }; unsafe { // Set function pointers to the C functions - GpuSpatialJoinerCreate(joiner_ptr); - } - - if let Some(init_fn) = self.joiner.init { - let c_ptx_root = CString::new(ptx_root).expect("CString::new failed"); - - let mut config = GpuSpatialJoinerConfig { - concurrency, - ptx_root: c_ptx_root.as_ptr(), - }; - - // This is an unsafe call because it's calling a C function from the bindings. 
- unsafe { - if init_fn(&self.joiner as *const _ as *mut _, &mut config) != 0 { - let error_message = self.joiner.last_error; - let c_str = std::ffi::CStr::from_ptr(error_message); - let error_string = c_str.to_string_lossy().into_owned(); - return Err(GpuSpatialError::Init(error_string)); - } + if GpuSpatialIndexFloat2DCreate(&mut index, &config) != 0 { + let error_message = index.get_last_error.unwrap()(&runtime as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + return Err(GpuSpatialError::Init(error_string)); } } - Ok(()) + Ok(GpuSpatialIndexFloat2DWrapper { + index, + _runtime: runtime.clone(), + }) } /// # Clears the GpuSpatialJoiner @@ -126,69 +163,43 @@ impl GpuSpatialJoinerWrapper { /// instead of building a new one because creating a new joiner is expensive. /// **This method is not thread-safe and should be called from a single thread.** pub fn clear(&mut self) { - if let Some(clear_fn) = self.joiner.clear { + if let Some(clear_fn) = self.index.clear { unsafe { - clear_fn(&mut self.joiner as *mut _); + clear_fn(&mut self.index as *mut _); } } } - /// # Pushes an array of WKBs to the build side of the joiner + /// # Pushes an array of rectangles to the build side of the joiner /// This function can be called multiple times to push multiple arrays. - /// The joiner will internally parse the WKBs and build a spatial index. + /// The joiner will internally parse the rectangles and build a spatial index. /// After pushing all build data, you must call `finish_building()` to build the /// spatial index. /// **This method is not thread-safe and should be called from a single thread.** /// # Arguments - /// * `array` - The array of WKBs to push. - /// * `offset` - The offset of the array to push. - /// * `length` - The length of the array to push. - pub fn push_build( + /// * `buf` - The array pointer to the rectangles to push. 
+ /// * `n_rects` - The number of rectangles in the array. + /// # Safety + /// This function is unsafe because it takes a raw pointer to the rectangles. + /// + pub unsafe fn push_build( &mut self, - array: &ArrayRef, - offset: i64, - length: i64, + buf: *const f32, + n_rects: u32, ) -> Result<(), GpuSpatialError> { - log::info!( - "DEBUG FFI: push_build called with offset={}, length={}", - offset, - length - ); - log::info!( - "DEBUG FFI: Array length={}, null_count={}", - array.len(), - array.null_count() - ); - - // 1. Convert the single ArrayRef to its FFI representation - let (ffi_array, _) = arrow_array::ffi::to_ffi(&array.to_data())?; - - log::info!("DEBUG FFI: FFI conversion successful"); - log::info!("DEBUG FFI: FFI array null_count={}", ffi_array.null_count()); - - // 2. Get the raw pointer to the FFI_ArrowArray struct - // let arrow_ptr = &mut ffi_array as *mut FFI_ArrowArray as *mut ArrowArray; + log::debug!("DEBUG FFI: push_build called with length={}", n_rects); - if let Some(push_build_fn) = self.joiner.push_build { + if let Some(push_build_fn) = self.index.push_build { unsafe { - let ffi_array_ptr: *const ArrowArray = - transmute(&ffi_array as *const FFI_ArrowArray); - log::info!("DEBUG FFI: Calling C++ push_build function"); - if push_build_fn( - &mut self.joiner as *mut _, - std::ptr::null_mut(), // schema is unused currently - ffi_array_ptr as *mut _, - offset, - length, - ) != 0 - { - let error_message = self.joiner.last_error; + if push_build_fn(&mut self.index as *mut _, buf, n_rects) != 0 { + let error_message = + self.index.get_last_error.unwrap()(&mut self.index as *mut _); let c_str = std::ffi::CStr::from_ptr(error_message); let error_string = c_str.to_string_lossy().into_owned(); log::error!("DEBUG FFI: push_build failed: {}", error_string); return Err(GpuSpatialError::PushBuild(error_string)); } - log::info!("DEBUG FFI: push_build C++ call succeeded"); + log::debug!("DEBUG FFI: push_build C++ call succeeded"); } } Ok(()) @@ -201,10 
+212,11 @@ impl GpuSpatialJoinerWrapper { /// for spatial join operations. /// **This method is not thread-safe and should be called from a single thread.** pub fn finish_building(&mut self) -> Result<(), GpuSpatialError> { - if let Some(finish_building_fn) = self.joiner.finish_building { + if let Some(finish_building_fn) = self.index.finish_building { unsafe { - if finish_building_fn(&mut self.joiner as *mut _) != 0 { - let error_message = self.joiner.last_error; + if finish_building_fn(&mut self.index as *mut _) != 0 { + let error_message = + self.index.get_last_error.unwrap()(&mut self.index as *mut _); let c_str = std::ffi::CStr::from_ptr(error_message); let error_string = c_str.to_string_lossy().into_owned(); return Err(GpuSpatialError::FinishBuild(error_string)); @@ -224,89 +236,73 @@ impl GpuSpatialJoinerWrapper { /// The context can be destroyed by calling the `destroy_context` function pointer in the `GpuSpatialJoiner` struct. /// The context should be destroyed before destroying the joiner. 
/// **This method is thread-safe.** - pub fn create_context(&mut self, ctx: &mut GpuSpatialJoinerContext) { - if let Some(create_context_fn) = self.joiner.create_context { + pub fn create_context(&self, ctx: &mut SedonaSpatialIndexContext) { + if let Some(create_context_fn) = self.index.create_context { unsafe { - create_context_fn(&mut self.joiner as *mut _, ctx as *mut _); + // Cast the shared reference to a raw pointer, then to a mutable raw pointer + create_context_fn(ctx as *mut _); } } } - pub fn destroy_context(&mut self, ctx: &mut GpuSpatialJoinerContext) { - if let Some(destroy_context_fn) = self.joiner.destroy_context { + pub fn destroy_context(&self, ctx: &mut SedonaSpatialIndexContext) { + if let Some(destroy_context_fn) = self.index.destroy_context { unsafe { destroy_context_fn(ctx as *mut _); } } } - pub fn push_stream( - &mut self, - ctx: &mut GpuSpatialJoinerContext, - array: &ArrayRef, - offset: i64, - length: i64, - predicate: GpuSpatialPredicateWrapper, - array_index_offset: i32, + /// # Probes an array of rectangles against the built spatial index + /// This function probes an array of rectangles against the spatial index built + /// using `push_build()` and `finish_building()`. It finds all pairs of rectangles + /// that satisfy the spatial relation defined by the index. + /// The results are stored in the context passed to the function. + /// **This method is thread-safe if each thread uses its own context.** + /// # Arguments + /// * `ctx` - The context for the thread performing the spatial join. + /// * `buf` - A pointer to the array of rectangles to probe. + /// * `n_rects` - The number of rectangles in the array. + /// # Safety + /// This function is unsafe because it takes a raw pointer to the rectangles. 
+ pub unsafe fn probe( + &self, + ctx: &mut SedonaSpatialIndexContext, + buf: *const f32, + n_rects: u32, ) -> Result<(), GpuSpatialError> { - log::info!( - "DEBUG FFI: push_stream called with offset={}, length={}, predicate={:?}", - offset, - length, - predicate - ); - log::info!( - "DEBUG FFI: Array length={}, null_count={}", - array.len(), - array.null_count() - ); - - // 1. Convert the single ArrayRef to its FFI representation - let (ffi_array, _) = arrow_array::ffi::to_ffi(&array.to_data())?; - - log::info!("DEBUG FFI: FFI conversion successful"); - log::info!("DEBUG FFI: FFI array null_count={}", ffi_array.null_count()); - - // 2. Get the raw pointer to the FFI_ArrowArray struct - // let arrow_ptr = &mut ffi_array as *mut FFI_ArrowArray as *mut ArrowArray; + log::debug!("DEBUG FFI: probe called with length={}", n_rects); - if let Some(push_stream_fn) = self.joiner.push_stream { + if let Some(probe_fn) = self.index.probe { unsafe { - let ffi_array_ptr: *const ArrowArray = - transmute(&ffi_array as *const FFI_ArrowArray); - log::info!("DEBUG FFI: Calling C++ push_stream function"); - if push_stream_fn( - &mut self.joiner as *mut _, + if probe_fn( + &self.index as *const _ as *mut _, ctx as *mut _, - std::ptr::null_mut(), // schema is unused currently - ffi_array_ptr as *mut _, - offset, - length, - predicate as c_uint, - array_index_offset, + buf, + n_rects, ) != 0 { - let error_message = ctx.last_error; + let error_message = self.index.context_get_last_error.unwrap()(ctx); let c_str = std::ffi::CStr::from_ptr(error_message); let error_string = c_str.to_string_lossy().into_owned(); - log::error!("DEBUG FFI: push_stream failed: {}", error_string); - return Err(GpuSpatialError::PushStream(error_string)); + log::error!("DEBUG FFI: probe failed: {}", error_string); + return Err(GpuSpatialError::Probe(error_string)); } - log::info!("DEBUG FFI: push_stream C++ call succeeded"); + log::debug!("DEBUG FFI: probe C++ call succeeded"); } } Ok(()) } - pub fn 
get_build_indices_buffer(&self, ctx: &mut GpuSpatialJoinerContext) -> &[u32] { - if let Some(get_build_indices_buffer_fn) = self.joiner.get_build_indices_buffer { - let mut build_indices_ptr: *mut c_void = std::ptr::null_mut(); + pub fn get_build_indices_buffer(&self, ctx: &mut SedonaSpatialIndexContext) -> &[u32] { + if let Some(get_build_indices_buffer_fn) = self.index.get_build_indices_buffer { + let mut build_indices_ptr: *mut u32 = std::ptr::null_mut(); let mut build_indices_len: u32 = 0; unsafe { get_build_indices_buffer_fn( ctx as *mut _, - &mut build_indices_ptr as *mut *mut c_void, + &mut build_indices_ptr as *mut *mut u32, &mut build_indices_len as *mut u32, ); @@ -331,179 +327,387 @@ impl GpuSpatialJoinerWrapper { &[] } - pub fn get_stream_indices_buffer(&self, ctx: &mut GpuSpatialJoinerContext) -> &[u32] { - if let Some(get_stream_indices_buffer_fn) = self.joiner.get_stream_indices_buffer { - let mut stream_indices_ptr: *mut c_void = std::ptr::null_mut(); - let mut stream_indices_len: u32 = 0; + pub fn get_probe_indices_buffer(&self, ctx: &mut SedonaSpatialIndexContext) -> &[u32] { + if let Some(get_probe_indices_buffer_fn) = self.index.get_probe_indices_buffer { + let mut probe_indices_ptr: *mut u32 = std::ptr::null_mut(); + let mut probe_indices_len: u32 = 0; unsafe { - get_stream_indices_buffer_fn( + get_probe_indices_buffer_fn( ctx as *mut _, - &mut stream_indices_ptr as *mut *mut c_void, - &mut stream_indices_len as *mut u32, + &mut probe_indices_ptr as *mut *mut u32, + &mut probe_indices_len as *mut u32, ); // Check length first - empty vectors return empty slice - if stream_indices_len == 0 { + if probe_indices_len == 0 { return &[]; } // Validate pointer (should not be null if length > 0) - if stream_indices_ptr.is_null() { + if probe_indices_ptr.is_null() { return &[]; } // Convert the raw pointer to a slice. This is safe to do because // we've validated the pointer is non-null and length is valid. 
- let typed_ptr = stream_indices_ptr as *const u32; + let typed_ptr = probe_indices_ptr as *const u32; // Safety: We've checked ptr is non-null and len > 0 - return std::slice::from_raw_parts(typed_ptr, stream_indices_len as usize); + return std::slice::from_raw_parts(typed_ptr, probe_indices_len as usize); } } &[] } +} - pub fn release(&mut self) { - // Call the release function if it exists - if let Some(release_fn) = self.joiner.release { - unsafe { - release_fn(&mut self.joiner as *mut _); - } +impl Default for GpuSpatialIndexFloat2DWrapper { + fn default() -> Self { + GpuSpatialIndexFloat2DWrapper { + index: SedonaFloatIndex2D { + clear: None, + create_context: None, + destroy_context: None, + push_build: None, + finish_building: None, + probe: None, + get_build_indices_buffer: None, + get_probe_indices_buffer: None, + get_last_error: None, + context_get_last_error: None, + release: None, + private_data: std::ptr::null_mut(), + }, + _runtime: Arc::new(Mutex::new(GpuSpatialRuntimeWrapper::default())), } } } -impl Drop for GpuSpatialJoinerWrapper { +impl Drop for GpuSpatialIndexFloat2DWrapper { fn drop(&mut self) { // Call the release function if it exists - if let Some(release_fn) = self.joiner.release { + if let Some(release_fn) = self.index.release { unsafe { - release_fn(&mut self.joiner as *mut _); + release_fn(&mut self.index as *mut _); } } } } -#[cfg(test)] -mod test { - use super::*; - use sedona_expr::scalar_udf::SedonaScalarUDF; - use sedona_geos::register::scalar_kernels; - use sedona_schema::crs::lnglat; - use sedona_schema::datatypes::{Edges, SedonaType, WKB_GEOMETRY}; - use sedona_testing::create::create_array_storage; - use sedona_testing::testers::ScalarUdfTester; - use std::env; - use std::path::PathBuf; - - #[test] - fn test_gpu_joiner_end2end() { - let mut joiner = GpuSpatialJoinerWrapper::new(); - - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - let ptx_root = out_path.join("share/gpuspatial/shaders"); - - joiner - .init( - 1, 
- ptx_root.to_str().expect("Failed to convert path to string"), - ) - .expect("Failed to init GpuSpatialJoiner"); - - let polygon_values = &[ - Some("POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"), - Some("POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))"), - Some("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (2 2, 3 2, 3 3, 2 3, 2 2), (6 6, 8 6, 8 8, 6 8, 6 6))"), - Some("POLYGON ((30 0, 60 20, 50 50, 10 50, 0 20, 30 0), (20 30, 25 40, 15 40, 20 30), (30 30, 35 40, 25 40, 30 30), (40 30, 45 40, 35 40, 40 30))"), - Some("POLYGON ((40 0, 50 30, 80 20, 90 70, 60 90, 30 80, 20 40, 40 0), (50 20, 65 30, 60 50, 45 40, 50 20), (30 60, 50 70, 45 80, 30 60))"), - ]; - let polygons = create_array_storage(polygon_values, &WKB_GEOMETRY); - - // Let the gpusaptial joiner to parse WKBs and get building boxes - joiner - .push_build(&polygons, 0, polygons.len().try_into().unwrap()) - .expect("Failed to push building"); - // Build a spatial index for Build internally on GPU - joiner.finish_building().expect("Failed to finish building"); - - // Each thread that performs spatial joins should have its own context. - // The context is passed to PushStream calls to perform spatial joins. 
- let mut ctx = GpuSpatialJoinerContext {
- last_error: std::ptr::null(),
+#[repr(u32)]
+#[derive(Debug, PartialEq, Copy, Clone)]
+pub enum GpuSpatialRelationPredicateWrapper {
+ Equals = 0,
+ Disjoint = 1,
+ Touches = 2,
+ Contains = 3,
+ Covers = 4,
+ Intersects = 5,
+ Within = 6,
+ CoveredBy = 7,
+}
+
+impl TryFrom<c_uint> for GpuSpatialRelationPredicateWrapper {
+ type Error = &'static str;
+
+ fn try_from(v: c_uint) -> Result<Self, Self::Error> {
+ match v {
+ 0 => Ok(GpuSpatialRelationPredicateWrapper::Equals),
+ 1 => Ok(GpuSpatialRelationPredicateWrapper::Disjoint),
+ 2 => Ok(GpuSpatialRelationPredicateWrapper::Touches),
+ 3 => Ok(GpuSpatialRelationPredicateWrapper::Contains),
+ 4 => Ok(GpuSpatialRelationPredicateWrapper::Covers),
+ 5 => Ok(GpuSpatialRelationPredicateWrapper::Intersects),
+ 6 => Ok(GpuSpatialRelationPredicateWrapper::Within),
+ 7 => Ok(GpuSpatialRelationPredicateWrapper::CoveredBy),
+ _ => Err("Invalid GpuSpatialPredicate value"),
+ }
+ }
+}
+
+pub struct GpuSpatialRefinerWrapper {
+ refiner: SedonaSpatialRefiner,
+ _runtime: Arc<Mutex<GpuSpatialRuntimeWrapper>>, // Keep a reference to the RT engine to ensure it lives as long as the refiner
+}
+
+impl GpuSpatialRefinerWrapper {
+ /// # Initializes the GpuSpatialRefiner
+ /// This function should only be called once per refiner instance.
+ ///
+ /// # Arguments
+ /// * `concurrency` - How many threads will call the refiner concurrently.
+ /// * `runtime` - The shared GPU runtime engine backing this refiner.
+ pub fn try_new(
+ runtime: &Arc<Mutex<GpuSpatialRuntimeWrapper>>,
+ concurrency: u32,
+ compress_bvh: bool,
+ pipeline_batches: u32,
+ ) -> Result<Self, GpuSpatialError> {
+ let mut refiner = SedonaSpatialRefiner {
+ clear: None,
+ push_build: None,
+ finish_building: None,
+ refine_loaded: None,
+ refine: None,
+ get_last_error: None,
+ release: None,
 private_data: std::ptr::null_mut(),
- build_indices: std::ptr::null_mut(),
- stream_indices: std::ptr::null_mut(),
 };
+ let mut engine_guard = runtime
+ .lock()
+ .map_err(|_| GpuSpatialError::Init("Failed to acquire mutex lock".to_string()))?;
+ let config = GpuSpatialRefinerConfig {
+ runtime: &mut engine_guard.runtime,
+ concurrency,
+ compress_bvh,
+ pipeline_batches,
+ };
+ unsafe {
+ // Set function pointers to the C functions
+ if GpuSpatialRefinerCreate(&mut refiner, &config) != 0 {
+ let error_message = refiner.get_last_error.unwrap()(&refiner as *const _ as *mut _);
+ let c_str = std::ffi::CStr::from_ptr(error_message);
+ let error_string = c_str.to_string_lossy().into_owned();
+ return Err(GpuSpatialError::Init(error_string));
+ }
+ }
+ Ok(GpuSpatialRefinerWrapper {
+ refiner,
+ _runtime: runtime.clone(),
+ })
+ }
+
+ pub fn clear(&self) {
+ log::debug!("DEBUG FFI: clear called");
+ if let Some(clear_fn) = self.refiner.clear {
+ unsafe {
+ clear_fn(&self.refiner as *const _ as *mut _);
+ }
+ log::debug!("DEBUG FFI: clear completed");
+ }
+ }
- joiner.create_context(&mut ctx);
-
- let point_values = &[
- Some("POINT (30 20)"), // poly0
- Some("POINT (20 20)"), // poly1
- Some("POINT (1 1)"), // poly2
- Some("POINT (70 70)"),
- Some("POINT (55 35)"), // poly4
- ];
- let points = create_array_storage(point_values, &WKB_GEOMETRY);
-
- // array_index_offset offsets the result of stream indices
- let array_index_offset = 0;
- joiner
- .push_stream(
- &mut ctx,
- &points,
- 0,
- points.len().try_into().unwrap(),
- GpuSpatialPredicateWrapper::Intersects,
- array_index_offset,
- )
- .expect("Failed to push building");
-
- let build_indices = 
joiner.get_build_indices_buffer(&mut ctx); - let stream_indices = joiner.get_stream_indices_buffer(&mut ctx); - - let mut result_pairs: Vec<(u32, u32)> = Vec::new(); - - for (build_index, stream_index) in build_indices.iter().zip(stream_indices.iter()) { - result_pairs.push((*build_index, *stream_index)); + /// # Loads a build array into the GPU spatial refiner + /// This function loads an array of geometries into the GPU spatial refiner + /// for parsing and loading on the GPU side. + /// # Arguments + /// * `array` - The array of geometries to load. + /// # Returns + /// * `Result<(), GpuSpatialError>` - Ok if successful, Err if an error occurred. + pub fn push_build(&self, array: &ArrayRef) -> Result<(), GpuSpatialError> { + log::debug!("DEBUG FFI: push_build called with array={}", array.len(),); + + let (ffi_array, ffi_schema) = arrow_array::ffi::to_ffi(&array.to_data())?; + log::debug!("DEBUG FFI: FFI conversion successful"); + if let Some(load_fn) = self.refiner.push_build { + unsafe { + let ffi_array_ptr: *const ArrowArray = + transmute(&ffi_array as *const FFI_ArrowArray); + let ffi_schema_ptr: *const ArrowSchema = + transmute(&ffi_schema as *const FFI_ArrowSchema); + log::debug!("DEBUG FFI: Calling C++ refine function"); + let _new_len: u32 = 0; + if load_fn( + &self.refiner as *const _ as *mut _, + ffi_schema_ptr as *mut _, + ffi_array_ptr as *mut _, + ) != 0 + { + let error_message = + self.refiner.get_last_error.unwrap()(&self.refiner as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + log::error!("DEBUG FFI: push_build failed: {}", error_string); + return Err(GpuSpatialError::PushBuild(error_string)); + } + log::debug!("DEBUG FFI: push_build C++ call succeeded"); + } + } + Ok(()) + } + + pub fn finish_building(&self) -> Result<(), GpuSpatialError> { + log::debug!("DEBUG FFI: finish_building called"); + + if let Some(finish_building_fn) = self.refiner.finish_building 
{ + unsafe { + if finish_building_fn(&self.refiner as *const _ as *mut _) != 0 { + let error_message = + self.refiner.get_last_error.unwrap()(&self.refiner as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + log::error!("DEBUG FFI: finish_building failed: {}", error_string); + return Err(GpuSpatialError::FinishBuild(error_string)); + } + log::debug!("DEBUG FFI: finish_building C++ call succeeded"); + } } + Ok(()) + } - let kernels = scalar_kernels(); - - // Iterate through the vector and find the one named "st_intersects" - let st_intersects = kernels - .into_iter() - .find(|(name, _)| *name == "st_intersects") - .map(|(_, kernel_ref)| kernel_ref) - .unwrap(); - - let sedona_type = SedonaType::Wkb(Edges::Planar, lnglat()); - let udf = SedonaScalarUDF::from_kernel("st_intersects", st_intersects); - let tester = - ScalarUdfTester::new(udf.into(), vec![sedona_type.clone(), sedona_type.clone()]); - - let mut answer_pairs: Vec<(u32, u32)> = Vec::new(); - - for (poly_index, poly) in polygon_values.iter().enumerate() { - for (point_index, point) in point_values.iter().enumerate() { - let result = tester - .invoke_scalar_scalar(poly.unwrap(), point.unwrap()) - .unwrap(); - if result == true.into() { - answer_pairs.push((poly_index as u32, point_index as u32)); + /// # Refines candidate pairs using the GPU spatial refiner + /// This function refines candidate pairs of geometries using the GPU spatial refiner. + /// It takes the probe side array of geometries and a predicate, and outputs the refined pairs of + /// indices that satisfy the predicate. + /// # Arguments + /// * `array` - The array of geometries on the probe side. + /// * `predicate` - The spatial relation predicate to use for refinement. + /// * `build_indices` - The input/output vector of indices for the first array. + /// * `probe_indices` - The input/output vector of indices for the second array. 
+ /// # Returns
+ /// * `Result<(), GpuSpatialError>` - Ok if successful, Err if an error occurred.
+ pub fn refine_loaded(
+ &self,
+ array: &ArrayRef,
+ predicate: GpuSpatialRelationPredicateWrapper,
+ build_indices: &mut Vec<u32>,
+ probe_indices: &mut Vec<u32>,
+ ) -> Result<(), GpuSpatialError> {
+ log::debug!(
+ "DEBUG FFI: refine called with array={}, indices={}, predicate={:?}",
+ array.len(),
+ build_indices.len(),
+ predicate
+ );
+
+ let (ffi_array, ffi_schema) = arrow_array::ffi::to_ffi(&array.to_data())?;
+
+ log::debug!("DEBUG FFI: FFI conversion successful");
+
+ if let Some(refine_fn) = self.refiner.refine_loaded {
+ unsafe {
+ let ffi_array_ptr: *const ArrowArray =
+ transmute(&ffi_array as *const FFI_ArrowArray);
+ let ffi_schema_ptr: *const ArrowSchema =
+ transmute(&ffi_schema as *const FFI_ArrowSchema);
+ log::debug!("DEBUG FFI: Calling C++ refine function");
+ let mut new_len: u32 = 0;
+ if refine_fn(
+ &self.refiner as *const _ as *mut _,
+ ffi_schema_ptr as *mut _,
+ ffi_array_ptr as *mut _,
+ predicate as c_uint,
+ build_indices.as_mut_ptr(),
+ probe_indices.as_mut_ptr(),
+ build_indices.len() as u32,
+ &mut new_len as *mut u32,
+ ) != 0
+ {
+ let error_message =
+ self.refiner.get_last_error.unwrap()(&self.refiner as *const _ as *mut _);
+ let c_str = std::ffi::CStr::from_ptr(error_message);
+ let error_string = c_str.to_string_lossy().into_owned();
+ log::error!("DEBUG FFI: refine failed: {}", error_string);
+ return Err(GpuSpatialError::Refine(error_string));
 }
+ log::debug!("DEBUG FFI: refine C++ call succeeded");
+ // Update the lengths of the output index vectors
+ build_indices.truncate(new_len as usize);
+ probe_indices.truncate(new_len as usize);
 }
 }
+ Ok(())
+ }
+ /// # Refines candidate pairs using the GPU spatial refiner
+ /// This function refines candidate pairs of geometries using the GPU spatial refiner.
+ /// It takes two arrays of geometries and a predicate, and outputs the refined pairs of
+ /// indices that satisfy the predicate.
+ /// # Arguments
+ /// * `array1` - The first array of geometries.
+ /// * `array2` - The second array of geometries.
+ /// * `predicate` - The spatial relation predicate to use for refinement.
+ /// * `indices1` - The input/output vector of indices for the first array.
+ /// * `indices2` - The input/output vector of indices for the second array.
+ /// # Returns
+ /// * `Result<(), GpuSpatialError>` - Ok if successful, Err if an error occurred.
+ pub fn refine(
+ &self,
+ array1: &ArrayRef,
+ array2: &ArrayRef,
+ predicate: GpuSpatialRelationPredicateWrapper,
+ indices1: &mut Vec<u32>,
+ indices2: &mut Vec<u32>,
+ ) -> Result<(), GpuSpatialError> {
+ log::debug!(
+ "DEBUG FFI: refine called with array1={}, array2={}, indices={}, predicate={:?}",
+ array1.len(),
+ array2.len(),
+ indices1.len(),
+ predicate
+ );
- // Sort both vectors. The default sort on tuples compares element by element.
- result_pairs.sort();
- answer_pairs.sort();
+ let (ffi_array1, ffi_schema1) = arrow_array::ffi::to_ffi(&array1.to_data())?;
+ let (ffi_array2, ffi_schema2) = arrow_array::ffi::to_ffi(&array2.to_data())?;
- // Assert that the two sorted vectors are equal. 
- assert_eq!(result_pairs, answer_pairs); + log::debug!("DEBUG FFI: FFI conversion successful"); - joiner.destroy_context(&mut ctx); - joiner.release(); + if let Some(refine_fn) = self.refiner.refine { + unsafe { + let ffi_array1_ptr: *const ArrowArray = + transmute(&ffi_array1 as *const FFI_ArrowArray); + let ffi_schema1_ptr: *const ArrowSchema = + transmute(&ffi_schema1 as *const FFI_ArrowSchema); + let ffi_array2_ptr: *const ArrowArray = + transmute(&ffi_array2 as *const FFI_ArrowArray); + let ffi_schema2_ptr: *const ArrowSchema = + transmute(&ffi_schema2 as *const FFI_ArrowSchema); + log::debug!("DEBUG FFI: Calling C++ refine function"); + let mut new_len: u32 = 0; + if refine_fn( + &self.refiner as *const _ as *mut _, + ffi_schema1_ptr as *mut _, + ffi_array1_ptr as *mut _, + ffi_schema2_ptr as *mut _, + ffi_array2_ptr as *mut _, + predicate as c_uint, + indices1.as_mut_ptr(), + indices2.as_mut_ptr(), + indices1.len() as u32, + &mut new_len as *mut u32, + ) != 0 + { + let error_message = + self.refiner.get_last_error.unwrap()(&self.refiner as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + log::error!("DEBUG FFI: refine failed: {}", error_string); + return Err(GpuSpatialError::Refine(error_string)); + } + log::debug!("DEBUG FFI: refine C++ call succeeded"); + // Update the lengths of the output index vectors + indices1.truncate(new_len as usize); + indices2.truncate(new_len as usize); + } + } + Ok(()) + } +} + +impl Default for GpuSpatialRefinerWrapper { + fn default() -> Self { + GpuSpatialRefinerWrapper { + refiner: SedonaSpatialRefiner { + clear: None, + push_build: None, + finish_building: None, + refine_loaded: None, + refine: None, + get_last_error: None, + release: None, + private_data: std::ptr::null_mut(), + }, + _runtime: Arc::new(Mutex::new(GpuSpatialRuntimeWrapper::default())), + } + } +} + +impl Drop for GpuSpatialRefinerWrapper { + fn drop(&mut self) { + // 
Call the release function if it exists + if let Some(release_fn) = self.refiner.release { + unsafe { + release_fn(&mut self.refiner as *mut _); + } + } } }