diff --git a/.github/workflows/rust-gpu.yml b/.github/workflows/rust-gpu.yml index fc54e4d32..f8aacdd44 100644 --- a/.github/workflows/rust-gpu.yml +++ b/.github/workflows/rust-gpu.yml @@ -27,7 +27,6 @@ on: - main paths: - 'c/sedona-libgpuspatial/**' - - 'rust/sedona-spatial-join-gpu/**' - '.github/workflows/rust-gpu.yml' push: @@ -35,7 +34,6 @@ on: - main paths: - 'c/sedona-libgpuspatial/**' - - 'rust/sedona-spatial-join-gpu/**' - '.github/workflows/rust-gpu.yml' concurrency: @@ -66,7 +64,7 @@ jobs: strategy: fail-fast: false matrix: - name: [ "clippy", "docs", "test", "build" ] + name: [ "build_tests", "build_lib", "build_package" ] name: "${{ matrix.name }}" runs-on: ubuntu-latest @@ -181,15 +179,6 @@ jobs: # Bump the number at the end of this line to force a new dependency build key: vcpkg-installed-${{ runner.os }}-${{ runner.arch }}-${{ env.VCPKG_REF }}-3 - # Install vcpkg dependencies from vcpkg.json manifest - - name: Install vcpkg dependencies - if: steps.cache-vcpkg.outputs.cache-hit != 'true' - run: | - ./vcpkg/vcpkg install abseil openssl - # Clean up vcpkg buildtrees and downloads to save space - rm -rf vcpkg/buildtrees - rm -rf vcpkg/downloads - - name: Use stable Rust id: rust run: | @@ -200,10 +189,20 @@ jobs: with: prefix-key: "rust-gpu-v4" + - name: Build libgpuspatial Tests + if: matrix.name == 'build_tests' + run: | + echo "=== Building libgpuspatial tests ===" + cd c/sedona-libgpuspatial/libgpuspatial + mkdir build + cmake --preset=default-with-tests -S . 
-B build + cmake --build build --target all + # Build WITH GPU feature to compile CUDA code # CUDA compilation (nvcc) works without GPU hardware # Only GPU runtime execution requires actual GPU - name: Build libgpuspatial (with CUDA compilation) + if: matrix.name == 'build_lib' run: | echo "=== Building libgpuspatial WITH GPU feature ===" echo "Compiling CUDA code using nvcc (no GPU hardware needed for compilation)" @@ -215,10 +214,7 @@ jobs: # --lib builds only the library, not test binaries cargo build --locked --package sedona-libgpuspatial --lib --features gpu --verbose - - name: Build libgpuspatial Tests + - name: Build GPU Spatial Join Package + if: matrix.name == 'build_package' run: | - echo "=== Building libgpuspatial tests ===" - cd c/sedona-libgpuspatial/libgpuspatial - mkdir build - cmake --preset=default-with-tests -S . -B build - cmake --build build --target all + cargo build --workspace --package sedona-spatial-join --features gpu --verbose diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index f2c4e5471..b74163b16 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -151,7 +151,7 @@ jobs: - name: Test if: matrix.name == 'test' run: | - cargo test --workspace --all-targets --all-features + cargo test --workspace --all-targets # Test all default features but GPU # Clean up intermediate build artifacts to free disk space aggressively cargo clean -p sedona-s2geography rm -rf target/debug/deps diff --git a/.gitignore b/.gitignore index 232ccf0f1..88819273f 100644 --- a/.gitignore +++ b/.gitignore @@ -51,3 +51,4 @@ __pycache__ dev/release/.env /.luarc.json +venv/ diff --git a/Cargo.lock b/Cargo.lock index bafb1b7ec..d84326486 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1533,6 +1533,16 @@ dependencies = [ "darling_macro 0.13.4", ] +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + [[package]] name = "darling" version = "0.23.0" @@ -1557,6 +1567,20 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim 0.11.1", + "syn 2.0.114", +] + [[package]] name = "darling_core" version = "0.23.0" @@ -1581,6 +1605,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn 2.0.114", +] + [[package]] name = "darling_macro" version = "0.23.0" @@ -3874,6 +3909,29 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "nvml-wrapper" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c9bff0aa1d48904a1385ea2a8b97576fbdcbc9a3cfccd0d31fe978e1c4038c5" +dependencies = [ + "bitflags", + "libloading 0.8.9", + "nvml-wrapper-sys", + "static_assertions", + "thiserror 1.0.69", + "wrapcenum-derive", +] + +[[package]] +name = "nvml-wrapper-sys" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "698d45156f28781a4e79652b6ebe2eaa0589057d588d3aec1333f6466f13fcb5" +dependencies = [ + "libloading 0.8.9", +] + [[package]] name = "object" version = "0.32.2" @@ -5278,13 +5336,16 @@ dependencies = [ "arrow-schema", "bindgen", "cmake", + "geo", "log", + "nvml-wrapper", "sedona-expr", "sedona-geos", "sedona-schema", "sedona-testing", "thiserror 2.0.17", "which", + "wkt 0.14.0", ] [[package]] @@ -5387,6 +5448,7 @@ dependencies = [ "arrow", 
"arrow-array", "arrow-schema", + "async-trait", "criterion", "datafusion", "datafusion-common", @@ -5403,6 +5465,7 @@ dependencies = [ "geo-traits", "geo-types", "geos", + "log", "once_cell", "parking_lot", "pin-project-lite", @@ -5416,6 +5479,7 @@ dependencies = [ "sedona-geo-traits-ext", "sedona-geometry", "sedona-geos", + "sedona-libgpuspatial", "sedona-schema", "sedona-testing", "sedona-tg", @@ -5484,6 +5548,7 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-ffi", + "env_logger 0.11.8", "futures", "libmimalloc-sys", "mimalloc", @@ -5757,6 +5822,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.10.0" @@ -6740,6 +6811,18 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "wrapcenum-derive" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a76ff259533532054cfbaefb115c613203c73707017459206380f03b3b3f266e" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "writeable" version = "0.6.2" diff --git a/Cargo.toml b/Cargo.toml index 6c20d23df..bca9a5b27 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -151,6 +151,7 @@ sedona-testing = { version = "0.3.0", path = "rust/sedona-testing" } # C wrapper crates sedona-geoarrow-c = { version = "0.3.0", path = "c/sedona-geoarrow-c" } sedona-geos = { version = "0.3.0", path = "c/sedona-geos" } +sedona-libgpuspatial = { version = "0.3.0", path = "c/sedona-libgpuspatial" } sedona-proj = { version = "0.3.0", path = "c/sedona-proj", default-features = false } sedona-s2geography = { version = "0.3.0", path = "c/sedona-s2geography" } sedona-tg = { version = "0.3.0", path = "c/sedona-tg" } diff --git a/c/sedona-geoarrow-c/build.rs b/c/sedona-geoarrow-c/build.rs 
index 4d8658415..871a22683 100644 --- a/c/sedona-geoarrow-c/build.rs +++ b/c/sedona-geoarrow-c/build.rs @@ -27,6 +27,7 @@ fn main() { .include("src/") .flag("-DGEOARROW_NAMESPACE=SedonaDB") .flag("-DNANOARROW_NAMESPACE=SedonaDB") + .flag("-Wno-type-limits") .compile("geoarrow"); cc::Build::new() diff --git a/c/sedona-libgpuspatial/Cargo.toml b/c/sedona-libgpuspatial/Cargo.toml index f271cd57a..efde2d986 100644 --- a/c/sedona-libgpuspatial/Cargo.toml +++ b/c/sedona-libgpuspatial/Cargo.toml @@ -40,8 +40,11 @@ which = "8.0" arrow-array = { workspace = true, features = ["ffi"] } arrow-schema = { workspace = true } thiserror = { workspace = true } +geo = { workspace = true } +wkt = { workspace = true } log = "0.4" sedona-schema = { path = "../../rust/sedona-schema" } +nvml-wrapper = "0.10.0" [dev-dependencies] sedona-expr = { path = "../../rust/sedona-expr" } diff --git a/c/sedona-libgpuspatial/build.rs b/c/sedona-libgpuspatial/build.rs index 6bf5f3f8b..ba2daae95 100644 --- a/c/sedona-libgpuspatial/build.rs +++ b/c/sedona-libgpuspatial/build.rs @@ -119,10 +119,17 @@ fn main() { println!("cargo:warning=CMAKE_CUDA_ARCHITECTURES environment variable not set. 
Defaulting to '86;89'."); "86;89".to_string() }); + // Determine the build profile to match Cargo's debug/release mode + let profile_mode = if cfg!(debug_assertions) { + "Debug" + } else { + "Release" + }; + let dst = cmake::Config::new("./libgpuspatial") .define("CMAKE_CUDA_ARCHITECTURES", cuda_architectures) .define("CMAKE_POLICY_VERSION_MINIMUM", "3.5") // Allow older CMake versions - .define("LIBGPUSPATIAL_LOGGING_LEVEL", "WARN") // Set logging level + .define("LIBGPUSPATIAL_LOGGING_LEVEL", "INFO") // Set logging level .build(); let include_path = dst.join("include"); println!( @@ -157,6 +164,17 @@ fn main() { println!("cargo:rustc-link-lib=static=gpuspatial"); println!("cargo:rustc-link-lib=static=rmm"); println!("cargo:rustc-link-lib=static=rapids_logger"); + // Use the 'd' suffix for the debug build of spdlog (libspdlogd.a) + let spdlog_lib_name = if cfg!(debug_assertions) { + "spdlogd" + } else { + "spdlog" + }; + println!( + "cargo:warning=Linking spdlog in {} mode: lib{}.a", + profile_mode, spdlog_lib_name + ); + println!("cargo:rustc-link-lib=static={}", spdlog_lib_name); println!("cargo:rustc-link-lib=static=geoarrow"); println!("cargo:rustc-link-lib=static=nanoarrow"); println!("cargo:rustc-link-lib=stdc++"); diff --git a/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt b/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt index 773cf2061..c97438d4e 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt +++ b/c/sedona-libgpuspatial/libgpuspatial/CMakeLists.txt @@ -132,8 +132,12 @@ config_shaders(PTX_FILES) message("-- Config shader PTX files ${PTX_FILES}") -add_library(gpuspatial src/rt/rt_engine.cpp src/relate_engine.cu src/spatial_joiner.cu - ${PTX_FILES}) +add_library(gpuspatial + src/rt/rt_engine.cpp + src/relate_engine.cu + src/rt_spatial_index.cu + src/rt_spatial_refiner.cu + ${PTX_FILES}) # Link libraries target_link_libraries(gpuspatial @@ -142,8 +146,7 @@ target_link_libraries(gpuspatial cuda rmm::rmm rapids_logger::rapids_logger 
- OptiX - PRIVATE zstd) + OptiX) # Set include directories target_include_directories(gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json b/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json index 55248ea7f..0cb8a7fbb 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json +++ b/c/sedona-libgpuspatial/libgpuspatial/CMakePresets.json @@ -31,7 +31,7 @@ "name": "default", "configurePreset": "default-with-tests", "environment": { - "GPUSPATIAL_TEST_DIR": "${sourceDir}/test_data" + "GPUSPATIAL_TEST_DIR": "${sourceDir}/test/data" } } ] diff --git a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake index 1f4d53c22..a7314c151 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake +++ b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_geoarrow.cmake @@ -47,6 +47,7 @@ function(find_and_configure_geoarrow) "BUILD_SHARED_LIBS OFF" ${_exclude_from_all}) set_target_properties(geoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_options(geoarrow PRIVATE -Wno-conversion) rapids_export_find_package_root(BUILD geoarrow "${geoarrow_BINARY_DIR}" diff --git a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake index ecc3b4179..61932beb6 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake +++ b/c/sedona-libgpuspatial/libgpuspatial/cmake/thirdparty/get_nanoarrow.cmake @@ -48,6 +48,10 @@ function(find_and_configure_nanoarrow) "NANOARROW_NAMESPACE gpuspatial" ${_exclude_from_all}) set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON) + if(TARGET nanoarrow_ipc) # Tests need this + target_compile_options(nanoarrow_ipc PRIVATE -Wno-conversion) + endif() + target_compile_options(nanoarrow PRIVATE -Wno-conversion) rapids_export_find_package_root(BUILD 
nanoarrow "${nanoarrow_BINARY_DIR}" diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.cuh index 9fb33fa8e..0badf7c53 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/box.cuh @@ -86,22 +86,26 @@ class Box { } DEV_HOST_INLINE OptixAabb ToOptixAabb() const { - OptixAabb aabb; + OptixAabb aabb{0, 0, 0, 0, 0, 0}; - memset(&aabb, 0, sizeof(OptixAabb)); - if (sizeof(scalar_t) == sizeof(float)) { + if constexpr (sizeof(scalar_t) == sizeof(float)) { for (int dim = 0; dim < n_dim; dim++) { - reinterpret_cast(&aabb.minX)[dim] = min_.get_coordinate(dim); - reinterpret_cast(&aabb.maxX)[dim] = max_.get_coordinate(dim); + auto min_val = min_.get_coordinate(dim); + auto max_val = max_.get_coordinate(dim); + if (min_val == max_val) { + min_val = next_float_from_double(min_val, -1, 2); + max_val = next_float_from_double(max_val, 1, 2); + } + (&aabb.minX)[dim] = min_val; + (&aabb.maxX)[dim] = max_val; } } else { for (int dim = 0; dim < n_dim; dim++) { auto min_val = min_.get_coordinate(dim); auto max_val = max_.get_coordinate(dim); - reinterpret_cast(&aabb.minX)[dim] = - next_float_from_double(min_val, -1, 2); - reinterpret_cast(&aabb.maxX)[dim] = next_float_from_double(max_val, 1, 2); + (&aabb.minX)[dim] = next_float_from_double(min_val, -1, 2); + (&aabb.maxX)[dim] = next_float_from_double(max_val, 1, 2); } } return aabb; @@ -137,6 +141,8 @@ class Box { DEV_HOST_INLINE scalar_t get_min(int dim) const { return min_.get_coordinate(dim); } + DEV_HOST_INLINE bool valid() const { return !min_.empty() && !max_.empty(); } + DEV_HOST_INLINE const point_t& get_max() const { return max_; } DEV_HOST_INLINE scalar_t get_max(int dim) const { return max_.get_coordinate(dim); } diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.cuh 
b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.cuh index 500d9def5..f9ababaaa 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/geom/point.cuh @@ -73,7 +73,14 @@ class Point { DEV_HOST_INLINE const scalar_t* get_data() const { return &data_.x; } - DEV_HOST_INLINE bool empty() const { return std::isnan(data_.x); } + DEV_HOST_INLINE bool empty() const { + for (int dim = 0; dim < n_dim; dim++) { + if (std::isnan(get_coordinate(dim))) { + return true; + } + } + return false; + } DEV_HOST_INLINE void set_empty() { for (int dim = 0; dim < n_dim; dim++) { @@ -102,11 +109,7 @@ class Point { * @brief Provides const access to the x-coordinate. * This method is only available if N_DIM >= 1. */ - DEV_HOST_INLINE const scalar_t& x() const { - if constexpr (N_DIM >= 1) { - return data_.x; - } - } + DEV_HOST_INLINE const scalar_t& x() const { return data_.x; } /** * @brief Provides access to the y-coordinate. diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h index b31af58b0..587e81121 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/gpuspatial_c.h @@ -14,60 +14,157 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
+#include #include #ifdef __cplusplus extern "C" { #endif -struct GpuSpatialJoinerConfig { - uint32_t concurrency; +struct ArrowSchema; +struct ArrowArray; + +// Interfaces for ray-tracing engine (OptiX) +struct GpuSpatialRuntimeConfig { + /** Path to PTX files */ const char* ptx_root; + /** Device ID to use, 0 is the first GPU */ + int device_id; + /** Ratio of initial memory pool size to total GPU memory, between 0.0 and 1.0; zero is + * effectively disable async memory allocation and using cudaMalloc */ + float cuda_init_memory_pool_ratio; }; -struct GpuSpatialJoinerContext { - const char* last_error; // Pointer to std::string to store last error message - void* private_data; // GPUSpatial context - void* build_indices; // Pointer to std::vector to store results - void* stream_indices; +struct GpuSpatialRuntime { + /** Initialize the runtime (OptiX) with the given configuration + * @return 0 on success, non-zero on failure + */ + int (*init)(struct GpuSpatialRuntime* self, struct GpuSpatialRuntimeConfig* config); + void (*release)(struct GpuSpatialRuntime* self); + const char* (*get_last_error)(struct GpuSpatialRuntime* self); + void* private_data; }; -enum GpuSpatialPredicate { - GpuSpatialPredicateEquals = 0, - GpuSpatialPredicateDisjoint, - GpuSpatialPredicateTouches, - GpuSpatialPredicateContains, - GpuSpatialPredicateCovers, - GpuSpatialPredicateIntersects, - GpuSpatialPredicateWithin, - GpuSpatialPredicateCoveredBy +/** Create an instance of GpuSpatialRuntime */ +void GpuSpatialRuntimeCreate(struct GpuSpatialRuntime* runtime); + +struct GpuSpatialIndexConfig { + /** Pointer to an initialized GpuSpatialRuntime struct */ + struct GpuSpatialRuntime* runtime; + /** How many threads will concurrently call Probe method */ + uint32_t concurrency; +}; + +// An opaque context for concurrent probing +struct SedonaSpatialIndexContext { + void* private_data; +}; + +struct SedonaFloatIndex2D { + /** Clear the spatial index, removing all built data */ + int 
(*clear)(struct SedonaFloatIndex2D* self); + /** Create a new context for concurrent probing */ + void (*create_context)(struct SedonaSpatialIndexContext* context); + /** Destroy a previously created context */ + void (*destroy_context)(struct SedonaSpatialIndexContext* context); + /** Push rectangles for building the spatial index, each rectangle is represented by 4 + * floats: [min_x, min_y, max_x, max_y] Points can also be indexed by providing [x, y, + * x, y] but points and rectangles cannot be mixed + * + * @return 0 on success, non-zero on failure + */ + int (*push_build)(struct SedonaFloatIndex2D* self, const float* buf, uint32_t n_rects); + /** + * Finish building the spatial index after all rectangles have been pushed + * + * @return 0 on success, non-zero on failure + */ + int (*finish_building)(struct SedonaFloatIndex2D* self); + /** + * Probe the spatial index with the given rectangles, each rectangle is represented by 4 + * floats: [min_x, min_y, max_x, max_y] Points can also be probed by providing [x, y, x, + * y] but points and rectangles cannot be mixed in one Probe call. The results of the + * probe will be stored in the context. 
+ * + * @return 0 on success, non-zero on failure + */ + int (*probe)(struct SedonaFloatIndex2D* self, struct SedonaSpatialIndexContext* context, + const float* buf, uint32_t n_rects); + /** Get the build indices buffer from the context + * + * @return A pointer to the buffer and its length + */ + void (*get_build_indices_buffer)(struct SedonaSpatialIndexContext* context, + uint32_t** build_indices, + uint32_t* build_indices_length); + /** Get the probe indices buffer from the context + * + * @return A pointer to the buffer and its length + */ + void (*get_probe_indices_buffer)(struct SedonaSpatialIndexContext* context, + uint32_t** probe_indices, + uint32_t* probe_indices_length); + const char* (*get_last_error)(struct SedonaFloatIndex2D* self); + const char* (*context_get_last_error)(struct SedonaSpatialIndexContext* context); + /** Release the spatial index and free all resources */ + void (*release)(struct SedonaFloatIndex2D* self); + void* private_data; }; -struct GpuSpatialJoiner { - int (*init)(struct GpuSpatialJoiner* self, struct GpuSpatialJoinerConfig* config); - void (*clear)(struct GpuSpatialJoiner* self); - void (*create_context)(struct GpuSpatialJoiner* self, - struct GpuSpatialJoinerContext* context); - void (*destroy_context)(struct GpuSpatialJoinerContext* context); - int (*push_build)(struct GpuSpatialJoiner* self, const struct ArrowSchema* schema, - const struct ArrowArray* array, int64_t offset, int64_t length); - int (*finish_building)(struct GpuSpatialJoiner* self); - int (*push_stream)(struct GpuSpatialJoiner* self, - struct GpuSpatialJoinerContext* context, - const struct ArrowSchema* schema, const struct ArrowArray* array, - int64_t offset, int64_t length, enum GpuSpatialPredicate predicate, - int32_t array_index_offset); - void (*get_build_indices_buffer)(struct GpuSpatialJoinerContext* context, - void** build_indices, uint32_t* build_indices_length); - void (*get_stream_indices_buffer)(struct GpuSpatialJoinerContext* context, - void** 
stream_indices, - uint32_t* stream_indices_length); - void (*release)(struct GpuSpatialJoiner* self); +int GpuSpatialIndexFloat2DCreate(struct SedonaFloatIndex2D* index, + const struct GpuSpatialIndexConfig* config); + +struct GpuSpatialRefinerConfig { + /** Pointer to an initialized GpuSpatialRuntime struct */ + struct GpuSpatialRuntime* runtime; + /** How many threads will concurrently call Probe method */ + uint32_t concurrency; + /** Whether to compress the BVH structures to save memory */ + bool compress_bvh; + /** Number of batches to pipeline for parsing and refinement; setting to 1 disables + * pipelining */ + uint32_t pipeline_batches; +}; + +enum SedonaSpatialRelationPredicate { + SedonaSpatialPredicateEquals = 0, + SedonaSpatialPredicateDisjoint, + SedonaSpatialPredicateTouches, + SedonaSpatialPredicateContains, + SedonaSpatialPredicateCovers, + SedonaSpatialPredicateIntersects, + SedonaSpatialPredicateWithin, + SedonaSpatialPredicateCoveredBy +}; + +struct SedonaSpatialRefiner { + int (*clear)(struct SedonaSpatialRefiner* self); + + int (*push_build)(struct SedonaSpatialRefiner* self, + const struct ArrowSchema* build_schema, + const struct ArrowArray* build_array); + + int (*finish_building)(struct SedonaSpatialRefiner* self); + + int (*refine_loaded)(struct SedonaSpatialRefiner* self, + const struct ArrowSchema* probe_schema, + const struct ArrowArray* probe_array, + enum SedonaSpatialRelationPredicate predicate, + uint32_t* build_indices, uint32_t* probe_indices, + uint32_t indices_size, uint32_t* new_indices_size); + + int (*refine)(struct SedonaSpatialRefiner* self, const struct ArrowSchema* schema1, + const struct ArrowArray* array1, const struct ArrowSchema* schema2, + const struct ArrowArray* array2, + enum SedonaSpatialRelationPredicate predicate, uint32_t* indices1, + uint32_t* indices2, uint32_t indices_size, uint32_t* new_indices_size); + const char* (*get_last_error)(struct SedonaSpatialRefiner* self); + void (*release)(struct 
SedonaSpatialRefiner* self); void* private_data; - const char* last_error; }; -void GpuSpatialJoinerCreate(struct GpuSpatialJoiner* index); +int GpuSpatialRefinerCreate(struct SedonaSpatialRefiner* refiner, + const struct GpuSpatialRefinerConfig* config); #ifdef __cplusplus } #endif diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/geometry_grouper.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/geometry_grouper.hpp deleted file mode 100644 index 5dab852d1..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/geometry_grouper.hpp +++ /dev/null @@ -1,294 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-#pragma once -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/loader/device_geometries.cuh" -#include "gpuspatial/utils/launcher.h" -#include "gpuspatial/utils/morton_code.h" - -#include "rmm/cuda_stream_view.hpp" -#include "rmm/device_uvector.hpp" -#include "rmm/exec_policy.hpp" - -#include -#include -#include - -#include - -namespace gpuspatial { -template -class GeometryGrouper { - using box_t = Box; - static constexpr int n_dim = POINT_T::n_dim; - using scalar_t = typename POINT_T::scalar_t; - - public: - void Group(const rmm::cuda_stream_view& stream, - const DeviceGeometries& geometries, - uint32_t geoms_per_aabb) { - switch (geometries.get_geometry_type()) { - case GeometryType::kPoint: { - Group( - stream, - geometries.template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kMultiPoint: { - Group(stream, - geometries - .template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kLineString: { - Group(stream, - geometries - .template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kMultiLineString: { - Group(stream, - geometries.template GetGeometryArrayView< - MultiLineStringArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kPolygon: { - Group(stream, - geometries - .template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kMultiPolygon: { - Group( - stream, - geometries - .template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - case GeometryType::kBox: { - Group(stream, - geometries.template GetGeometryArrayView>(), - geoms_per_aabb); - break; - } - default: - assert(false); - } - } - - template - void Group(const rmm::cuda_stream_view& stream, const GEOMETRY_ARRAY_T& geometries, - uint32_t geoms_per_aabb) { - rmm::device_uvector morton_codes(geometries.size(), stream); - POINT_T min_world_corner, max_world_corner; - - min_world_corner.set_max(); - max_world_corner.set_min(); - - for (int dim = 0; dim 
< n_dim; dim++) { - auto min_val = thrust::transform_reduce( - rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(geometries.size()), - [=] __host__ __device__(INDEX_T i) { - const auto& geom = geometries[i]; - const auto& mbr = geom.get_mbr(); - - return mbr.get_min(dim); - }, - std::numeric_limits::max(), thrust::minimum()); - - auto max_val = thrust::transform_reduce( - rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), - thrust::make_counting_iterator(geometries.size()), - [=] __host__ __device__(INDEX_T i) { - const auto& geom = geometries[i]; - const auto& mbr = geom.get_mbr(); - - return mbr.get_max(dim); - }, - std::numeric_limits::lowest(), thrust::maximum()); - min_world_corner.set_coordinate(dim, min_val); - max_world_corner.set_coordinate(dim, max_val); - } - - // compute morton codes and reorder indices - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(geometries.size()), - morton_codes.begin(), [=] __device__(INDEX_T i) { - const auto& geom = geometries[i]; - const auto& mbr = geom.get_mbr(); - auto p = mbr.centroid(); - POINT_T norm_p; - - for (int dim = 0; dim < n_dim; dim++) { - auto min_val = min_world_corner.get_coordinate(dim); - auto max_val = max_world_corner.get_coordinate(dim); - auto extent = min_val == max_val ? 
1 : max_val - min_val; - auto norm_val = (p.get_coordinate(dim) - min_val) / extent; - norm_p.set_coordinate(dim, norm_val); - } - return detail::morton_code(norm_p.get_vec()); - }); - reordered_indices_ = - std::make_unique>(geometries.size(), stream); - thrust::sequence(rmm::exec_policy_nosync(stream), reordered_indices_->begin(), - reordered_indices_->end()); - thrust::sort_by_key(rmm::exec_policy_nosync(stream), morton_codes.begin(), - morton_codes.end(), reordered_indices_->begin()); - - auto n_aabbs = (geometries.size() + geoms_per_aabb - 1) / geoms_per_aabb; - aabbs_ = std::make_unique>(n_aabbs, stream); - OptixAabb empty_aabb; - - if (n_dim == 2) { - empty_aabb = OptixAabb{ - std::numeric_limits::max(), std::numeric_limits::max(), 0, - std::numeric_limits::lowest(), std::numeric_limits::lowest(), 0}; - } else if (n_dim == 3) { - empty_aabb = OptixAabb{ - std::numeric_limits::max(), std::numeric_limits::max(), - std::numeric_limits::max(), std::numeric_limits::lowest(), - std::numeric_limits::lowest(), std::numeric_limits::lowest()}; - } - - thrust::fill(rmm::exec_policy_nosync(stream), aabbs_->begin(), aabbs_->end(), - empty_aabb); - - auto* p_aabbs = aabbs_->data(); - - rmm::device_uvector n_geoms_per_aabb(n_aabbs, stream); - - auto* p_reordered_indices = reordered_indices_->data(); - auto* p_n_geoms_per_aabb = n_geoms_per_aabb.data(); - - // each warp takes an AABB and processes points_per_aabb points - LaunchKernel(stream, [=] __device__() mutable { - typedef cub::WarpReduce WarpReduce; - __shared__ typename WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; - auto warp_id = threadIdx.x / 32; - auto lane_id = threadIdx.x % 32; - auto global_warp_id = TID_1D / 32; - auto n_warps = TOTAL_THREADS_1D / 32; - - for (uint32_t aabb_id = global_warp_id; aabb_id < n_aabbs; aabb_id += n_warps) { - POINT_T min_corner, max_corner; - size_t idx_begin = aabb_id * geoms_per_aabb; - size_t idx_end = std::min((size_t)geometries.size(), idx_begin + 
geoms_per_aabb); - size_t idx_end_rup = (idx_end + 31) / 32; - idx_end_rup *= 32; // round up to the next multiple of 32 - - p_n_geoms_per_aabb[aabb_id] = idx_end - idx_begin; - - for (auto idx = idx_begin + lane_id; idx < idx_end_rup; idx += 32) { - Box> mbr; - - auto warp_begin = idx - lane_id; - auto warp_end = std::min(warp_begin + 32, idx_end); - auto n_valid = warp_end - warp_begin; - - if (idx < idx_end) { - auto geom_idx = p_reordered_indices[idx]; - mbr = geometries[geom_idx].get_mbr(); - } - - for (int dim = 0; dim < n_dim; dim++) { - auto min_val = - WarpReduce(temp_storage[warp_id]) - .Reduce(mbr.get_min(dim), thrust::minimum(), n_valid); - if (lane_id == 0) { - min_corner.set_coordinate(dim, min_val); - } - auto max_val = - WarpReduce(temp_storage[warp_id]) - .Reduce(mbr.get_max(dim), thrust::maximum(), n_valid); - if (lane_id == 0) { - max_corner.set_coordinate(dim, max_val); - } - } - } - - if (lane_id == 0) { - box_t ext_mbr(min_corner, max_corner); - p_aabbs[aabb_id] = ext_mbr.ToOptixAabb(); - } - } - }); - - prefix_sum_ = std::make_unique>(n_aabbs + 1, stream); - prefix_sum_->set_element_to_zero_async(0, stream); - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), n_geoms_per_aabb.begin(), - n_geoms_per_aabb.end(), prefix_sum_->begin() + 1); -#ifndef NDEBUG - auto* p_prefix_sum = prefix_sum_->data(); - - thrust::for_each(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(aabbs_->size()), - [=] __device__(size_t aabb_idx) { - auto begin = p_prefix_sum[aabb_idx]; - auto end = p_prefix_sum[aabb_idx + 1]; - const auto& aabb = p_aabbs[aabb_idx]; - - for (auto i = begin; i < end; i++) { - auto geom_idx = p_reordered_indices[i]; - auto mbr = geometries[geom_idx].get_mbr(); - assert(mbr.covered_by(aabb)); - } - }); -#endif - } - - ArrayView get_aabbs() const { - if (aabbs_ != nullptr) { - return ArrayView(aabbs_->data(), aabbs_->size()); - } - return {}; - } - - ArrayView get_prefix_sum() const { - if 
(prefix_sum_ != nullptr) { - return ArrayView(prefix_sum_->data(), prefix_sum_->size()); - } - return {}; - } - - ArrayView get_reordered_indices() const { - if (reordered_indices_ != nullptr) { - return ArrayView(reordered_indices_->data(), reordered_indices_->size()); - } - return {}; - } - - void Clear() { - aabbs_ = nullptr; - prefix_sum_ = nullptr; - reordered_indices_ = nullptr; - } - - private: - std::unique_ptr> aabbs_; - std::unique_ptr> prefix_sum_; - std::unique_ptr> reordered_indices_; -}; -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/object_pool.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/object_pool.hpp deleted file mode 100644 index d0ab3e1ff..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/object_pool.hpp +++ /dev/null @@ -1,161 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#pragma once - -#include -#include -#include - -namespace gpuspatial { -// Forward declaration of ObjectPool to be used in the custom deleter. -template -class ObjectPool; - -// A helper struct to allow std::make_shared to access the private constructor. 
-// It inherits from ObjectPool and is defined outside of it. -template -struct PoolEnabler : public ObjectPool { - PoolEnabler(size_t size) : ObjectPool(size) {} -}; - -// A custom deleter for std::shared_ptr. -// When the shared_ptr's reference count goes to zero, this deleter -// will be invoked, returning the object to the pool instead of deleting it. -template -class PoolDeleter { - public: - // Constructor takes a weak_ptr to the pool to avoid circular references. - PoolDeleter(std::weak_ptr> pool) : pool_(pool) {} - - // The function call operator is what std::shared_ptr invokes. - void operator()(T* ptr) const { - // Attempt to lock the weak_ptr to get a shared_ptr to the pool. - if (auto pool_sp = pool_.lock()) { - // If the pool still exists, return the object to it. - pool_sp->release(ptr); - } else { - // If the pool no longer exists, we must delete the pointer to avoid a memory leak. - delete ptr; - } - } - - private: - std::weak_ptr> pool_; -}; - -/** - * @brief A thread-safe object pool for reusable objects. - * - * @tparam T The type of object to pool. - */ -template -class ObjectPool : public std::enable_shared_from_this> { - friend struct PoolEnabler; - - // Constructor is private to force object creation through the static 'create' method. - // This ensures the ObjectPool is always managed by a std::shared_ptr. - ObjectPool(size_t initial_size = 0) { - for (size_t i = 0; i < initial_size; ++i) { - pool_.push_back(new T()); - } - } - - public: - /** - * @brief Factory method to create an instance of the ObjectPool. - * Guarantees that the pool is managed by a std::shared_ptr, which is required - * for the custom deleter mechanism to work correctly. - * - * @param initial_size The number of objects to pre-allocate. - * @return A std::shared_ptr to the new ObjectPool instance. - */ - static std::shared_ptr> create(size_t initial_size = 0) { - return std::make_shared>(initial_size); - } - - /** - * @brief Destructor. 
Cleans up any remaining objects in the pool. - */ - ~ObjectPool() { - std::lock_guard lock(mutex_); - for (T* item : pool_) { - delete item; - } - pool_.clear(); - } - - // Disable copy constructor and assignment operator - ObjectPool(const ObjectPool&) = delete; - ObjectPool& operator=(const ObjectPool&) = delete; - - /** - * @brief Acquires an object from the pool. - * - * If the pool is empty, a new object is created. The returned shared_ptr - * has a custom deleter that will return the object to the pool when it's - * no longer referenced. - * - * @return A std::shared_ptr to an object of type T. - */ - std::shared_ptr take() { - std::lock_guard lock(mutex_); - T* resource_ptr = nullptr; - if (!pool_.empty()) { - // Take an existing object from the pool - resource_ptr = pool_.back(); - pool_.pop_back(); - } else { - // Pool is empty, create a new object - resource_ptr = new T(); - } - - // Create a custom deleter that knows how to return the object to this pool. - // this->shared_from_this() is now safe because creation is forced through the - // 'create' method. - PoolDeleter deleter(this->shared_from_this()); - - // Return a shared_ptr with the custom deleter. - return std::shared_ptr(resource_ptr, deleter); - } - - /** - * @brief Returns an object to the pool. - * - * This method is intended to be called by the PoolDeleter, not directly by clients. - * - * @param object The raw pointer to the object to return to the pool. - */ - void release(T* object) { - std::lock_guard lock(mutex_); - pool_.push_back(object); - } - - /** - * @brief Gets the current number of available objects in the pool. - * @return The size of the pool. 
- */ - size_t size() { - std::lock_guard lock(mutex_); - return pool_.size(); - } - - private: - std::vector pool_; - std::mutex mutex_; -}; - -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh new file mode 100644 index 000000000..4ae7036b6 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.cuh @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#pragma once + +#include "gpuspatial/index/rt_spatial_index.hpp" +#include "gpuspatial/index/spatial_index.hpp" +#include "gpuspatial/rt/rt_engine.hpp" +#include "gpuspatial/utils/gpu_timer.hpp" +#include "gpuspatial/utils/queue.h" + +#include "rmm/cuda_stream_pool.hpp" +#include "rmm/cuda_stream_view.hpp" +#include "rmm/device_uvector.hpp" +#define GPUSPATIAL_PROFILING +namespace gpuspatial { + +template +class RTSpatialIndex : public SpatialIndex { + using point_t = typename SpatialIndex::point_t; + using box_t = typename SpatialIndex::box_t; + using scalar_t = typename point_t::scalar_t; + static constexpr int n_dim = point_t::n_dim; + + using index_t = uint32_t; // type of the index to represent geometries + struct SpatialIndexContext { + rmm::cuda_stream_view stream; + std::string shader_id; + rmm::device_buffer bvh_buffer{0, rmm::cuda_stream_default}; + OptixTraversableHandle handle; + std::vector h_launch_params_buffer; + rmm::device_buffer launch_params_buffer{0, rmm::cuda_stream_default}; + std::unique_ptr> counter; + // output + Queue build_indices; + rmm::device_uvector probe_indices{0, rmm::cuda_stream_default}; +#ifdef GPUSPATIAL_PROFILING + GPUTimer timer; + // counters + double alloc_ms = 0.0; + double bvh_build_ms = 0.0; + double rt_ms = 0.0; + double copy_res_ms = 0.0; +#endif + }; + + public: + RTSpatialIndex() = default; + + RTSpatialIndex(const RTSpatialIndexConfig& config); + + void Clear() override; + + void PushBuild(const box_t* rects, uint32_t n_rects) override; + + void FinishBuilding() override; + + void Probe(const box_t* rects, uint32_t n_rects, std::vector* build_indices, + std::vector* probe_indices) override; + + private: + RTSpatialIndexConfig config_; + std::unique_ptr stream_pool_; + bool indexing_points_; + // The rectangles being indexed or the MBRs of grouped points + rmm::device_uvector rects_{0, rmm::cuda_stream_default}; + // Data structures for indexing points + rmm::device_uvector point_ranges_{0, 
rmm::cuda_stream_default}; + rmm::device_uvector reordered_point_indices_{0, rmm::cuda_stream_default}; + rmm::device_uvector points_{0, rmm::cuda_stream_default}; + rmm::device_buffer bvh_buffer_{0, rmm::cuda_stream_default}; + OptixTraversableHandle handle_; + + void allocateResultBuffer(SpatialIndexContext& ctx, uint32_t capacity) const; + + void handleBuildPoint(SpatialIndexContext& ctx, ArrayView points, + bool counting) const; + + void handleBuildPoint(SpatialIndexContext& ctx, ArrayView rects, + bool counting) const; + + void handleBuildBox(SpatialIndexContext& ctx, ArrayView points, + bool counting) const; + + void handleBuildBox(SpatialIndexContext& ctx, ArrayView rects, + bool counting) const; + + void prepareLaunchParamsBoxQuery(SpatialIndexContext& ctx, ArrayView probe_rects, + bool forward, bool counting) const; + + void filter(SpatialIndexContext& ctx, uint32_t dim_x) const; + + size_t numGeometries() const { + return indexing_points_ ? points_.size() : rects_.size(); + } +}; +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.hpp similarity index 53% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.hpp rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.hpp index 6c836dfa9..b34475edd 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.hpp +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/rt_spatial_index.hpp @@ -16,13 +16,31 @@ // under the License. 
#pragma once -#include "gpuspatial/index/streaming_joiner.hpp" +#include "gpuspatial/index/spatial_index.hpp" +#include "gpuspatial/rt/rt_engine.hpp" #include +#include namespace gpuspatial { -std::unique_ptr CreateSpatialJoiner(); -void InitSpatialJoiner(StreamingJoiner* index, const char* ptx_root, - uint32_t concurrency); +struct RTSpatialIndexConfig { + std::shared_ptr rt_engine; + // Prefer fast build the BVH + bool prefer_fast_build = false; + // Compress the BVH to save memory + bool compact = true; + // How many threads are allowed to call PushProbe concurrently + uint32_t concurrency = 1; + // number of points to represent an AABB when doing point-point queries + uint32_t n_points_per_aabb = 8; + RTSpatialIndexConfig() : prefer_fast_build(false), compact(false) { + concurrency = std::thread::hardware_concurrency(); + } +}; + +template +std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config); + } // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_index.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_index.hpp new file mode 100644 index 000000000..4ea761e14 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_index.hpp @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once +#include "gpuspatial/geom/box.cuh" +#include "gpuspatial/geom/point.cuh" + +#include +#include +#include + +namespace gpuspatial { +template +class SpatialIndex { + public: + using point_t = Point; + using box_t = Box; + + virtual ~SpatialIndex() = default; + + /** + * Provide an array of geometries to build the index. + * @param rects An array of rectangles to be indexed. + */ + virtual void PushBuild(const box_t* rects, uint32_t n_rects) = 0; + + /** + * Waiting the index to be built. + * This method should be called after all geometries have been pushed. + */ + virtual void FinishBuilding() = 0; + + /** + * Remove all geometries from the index, so the index can reused. + */ + virtual void Clear() = 0; + + /** + * Query the index with an array of rectangles and return the indices of + * the rectangles. This method is thread-safe. + * @param build_indices A vector to store the indices of the geometries in the index + * that have a spatial overlap with the geometries in the stream. + * @param stream_indices A vector to store the indices of the geometries in the stream + * that have a spatial overlap with the geometries in the index. 
+ */ + virtual void Probe(const box_t* rects, uint32_t n_rects, + std::vector* build_indices, + std::vector* stream_indices) { + throw std::runtime_error("Not implemented"); + } +}; + +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.cuh deleted file mode 100644 index 1c93a54b2..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/spatial_joiner.cuh +++ /dev/null @@ -1,184 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-#pragma once -#include "geoarrow/geoarrow_type.h" -#include "gpuspatial/geom/box.cuh" -#include "gpuspatial/geom/point.cuh" -#include "gpuspatial/index/detail/rt_engine.hpp" -#include "gpuspatial/index/geometry_grouper.hpp" -#include "gpuspatial/index/object_pool.hpp" -#include "gpuspatial/index/relate_engine.cuh" -#include "gpuspatial/index/streaming_joiner.hpp" -#include "gpuspatial/loader/device_geometries.cuh" -#include "gpuspatial/loader/parallel_wkb_loader.h" -#include "gpuspatial/utils/gpu_timer.hpp" -#include "gpuspatial/utils/queue.h" -#include "gpuspatial/utils/thread_pool.h" - -#include "rmm/cuda_stream_pool.hpp" -#include "rmm/cuda_stream_view.hpp" -#include "rmm/device_uvector.hpp" - -#include -#include - - -// #define GPUSPATIAL_PROFILING -namespace gpuspatial { - -class SpatialJoiner : public StreamingJoiner { - // TODO: Assuming every thing is 2D in double for now - using scalar_t = double; - static constexpr int n_dim = 2; - using index_t = uint32_t; // type of the index to represent geometries - // geometry types - using point_t = Point; - using multi_point_t = MultiPoint; - using line_string_t = LineString; - using multi_line_string_t = MultiLineString; - using polygon_t = Polygon; - using multi_polygon_t = MultiPolygon; - // geometry array types - using point_array_t = PointArrayView; - using multi_point_array_t = MultiPointArrayView; - using line_string_array_t = LineStringArrayView; - using multi_line_string_array_t = MultiLineStringArrayView; - using polygon_array_t = PolygonArrayView; - using multi_polygon_array_t = MultiPolygonArrayView; - - using dev_geometries_t = DeviceGeometries; - using box_t = Box>; - using loader_t = ParallelWkbLoader; - - public: - struct SpatialJoinerConfig : Config { - const char* ptx_root; - // Prefer fast build the BVH - bool prefer_fast_build = false; - // Compress the BVH to save memory - bool compact = true; - // Loader configurations - // How many threads to use for parsing WKBs - uint32_t parsing_threads = 
std::thread::hardware_concurrency(); - // How many threads are allowed to call PushStream concurrently - uint32_t concurrency = 1; - // number of points to represent an AABB when doing point-point queries - uint32_t n_points_per_aabb = 8; - // reserve a ratio of available memory for result sets - float result_buffer_memory_reserve_ratio = 0.2; - // the memory quota for relate engine compared to the available memory - float relate_engine_memory_quota = 0.8; - // this value determines RELATE_MAX_DEPTH - size_t stack_size_bytes = 3 * 1024; - SpatialJoinerConfig() : ptx_root(nullptr), prefer_fast_build(false), compact(false) { - concurrency = std::thread::hardware_concurrency(); - } - }; - - struct SpatialJoinerContext : Context { - rmm::cuda_stream_view cuda_stream; - std::string shader_id; - std::unique_ptr stream_loader; - dev_geometries_t stream_geometries; - std::unique_ptr bvh_buffer; - OptixTraversableHandle handle; - std::vector h_launch_params_buffer; - std::unique_ptr launch_params_buffer; - // output - Queue> results; - int32_t array_index_offset; -#ifdef GPUSPATIAL_PROFILING - GPUTimer timer; - // counters - double parse_ms = 0.0; - double alloc_ms = 0.0; - double filter_ms = 0.0; - double refine_ms = 0.0; - double copy_res_ms = 0.0; -#endif - }; - - SpatialJoiner() = default; - - ~SpatialJoiner() = default; - - void Init(const Config* config) override; - - void Clear() override; - - void PushBuild(const ArrowSchema* schema, const ArrowArray* array, int64_t offset, - int64_t length) override; - - void FinishBuilding() override; - - std::shared_ptr CreateContext() override { return ctx_pool_->take(); } - - void PushStream(Context* ctx, const ArrowSchema* schema, const ArrowArray* array, - int64_t offset, int64_t length, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices, - int32_t array_index_offset) override; - - // Internal method but has to be public for the CUDA kernel to access - void 
handleBuildPointStreamPoint(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices); - - void handleBuildBoxStreamPoint(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices); - - void handleBuildPointStreamBox(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices); - - void handleBuildBoxStreamBox(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices); - - void filter(SpatialJoinerContext* ctx, uint32_t dim_x, bool swap_id = false); - - void refine(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices); - - private: - SpatialJoinerConfig config_; - std::unique_ptr stream_pool_; - std::shared_ptr thread_pool_; - details::RTEngine rt_engine_; - std::unique_ptr bvh_buffer_; - std::unique_ptr build_loader_; - - DeviceGeometries build_geometries_; - // For grouping points with space-filing curve - GeometryGrouper geometry_grouper_; - RelateEngine relate_engine_; - OptixTraversableHandle handle_; - - std::shared_ptr> ctx_pool_; - - OptixTraversableHandle buildBVH(const rmm::cuda_stream_view& stream, - const ArrayView& aabbs, - std::unique_ptr& buffer); - - void allocateResultBuffer(SpatialJoinerContext* ctx); - - void prepareLaunchParamsBoxQuery(SpatialJoinerContext* ctx, bool forward); -}; - -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/streaming_joiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/streaming_joiner.hpp deleted file mode 100644 index ccf8a3bfe..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/streaming_joiner.hpp +++ /dev/null @@ -1,98 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#pragma once -#include "gpuspatial/relate/predicate.cuh" - -#include "nanoarrow/nanoarrow.hpp" - -#include -#include -#include -namespace gpuspatial { - -class StreamingJoiner { - public: - struct Context { - virtual ~Context() = default; - }; - - struct Config { - virtual ~Config() = default; - }; - - virtual ~StreamingJoiner() = default; - - /** - * Initialize the index with the given configuration. This method should be called only - * once before using the index. - * @param config - */ - virtual void Init(const Config* config) = 0; - - /** - * Provide an array of geometries to build the index. - * @param array ArrowArray that contains the geometries in WKB format. - * @param offset starting index of the ArrowArray - * @param length length of the ArrowArray to read. - */ - virtual void PushBuild(const ArrowSchema* schema, const ArrowArray* array, - int64_t offset, int64_t length) = 0; - - /** - * Waiting the index to be built. - * This method should be called after all geometries have been pushed. - */ - virtual void FinishBuilding() = 0; - - /** - * Remove all geometries from the index, so the index can reused. 
- */ - virtual void Clear() = 0; - - /** - * Query the index with an array of geometries in WKB format and return the indices of - * the geometries in stream and the index that satisfy a given predicate. This method is - * thread-safe. - * @param context A context object that can be used to store intermediate results. - * @param array ArrowArray that contains the geometries in WKB format. - * @param offset starting index of the ArrowArray - * @param length length of the ArrowArray to read. - * @param predicate A predicate to filter the query results. - * @param build_indices A vector to store the indices of the geometries in the index - * that have a spatial overlap with the geometries in the stream. - * @param stream_indices A vector to store the indices of the geometries in the stream - * that have a spatial overlap with the geometries in the index. - * @param stream_index_offset An offset to be added to stream_indices - */ - virtual void PushStream(Context* context, const ArrowSchema* schema, - const ArrowArray* array, int64_t offset, int64_t length, - Predicate predicate, std::vector* build_indices, - std::vector* stream_indices, - int32_t stream_index_offset) { - throw std::runtime_error("Not implemented"); - } - - /** - * Create a context object for issuing queries against the index. - * @return A context object that is used to store intermediate results. 
- */ - virtual std::shared_ptr CreateContext() { - throw std::runtime_error("Not implemented"); - } -}; - -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.h index cb2186ff3..d6a111e6b 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/loader/parallel_wkb_loader.h @@ -19,23 +19,30 @@ #include "gpuspatial/geom/geometry_type.cuh" #include "gpuspatial/loader/device_geometries.cuh" #include "gpuspatial/utils/logger.hpp" +#include "gpuspatial/utils/markers.h" #include "gpuspatial/utils/mem_utils.hpp" #include "gpuspatial/utils/stopwatch.h" #include "gpuspatial/utils/thread_pool.h" -#include "nanoarrow/nanoarrow.h" +#include "nanoarrow/nanoarrow.hpp" + +#include "geoarrow/geoarrow.hpp" #include "rmm/cuda_stream_view.hpp" #include "rmm/device_uvector.hpp" #include "rmm/exec_policy.hpp" +#include #include +#include + +#include +#include +#include #include #include - -#include -#include +#include namespace gpuspatial { namespace detail { @@ -43,50 +50,14 @@ namespace detail { inline long long get_free_physical_memory_linux() { struct sysinfo info; if (sysinfo(&info) == 0) { - // info.freeram is in bytes (or unit defined by info.mem_unit) - // Use info.freeram * info.mem_unit for total free bytes return (long long)info.freeram * (long long)info.mem_unit; } return 0; // Error } -// Copied from GeoArrow, it is faster than using GeoArrowWKBReaderRead -struct WKBReaderPrivate { - const uint8_t* data; - int64_t size_bytes; - const uint8_t* data0; - int need_swapping; - GeoArrowGeometry geom; -}; - -static int WKBReaderReadEndian(struct WKBReaderPrivate* s, struct GeoArrowError* error) { - if (s->size_bytes > 0) { - s->need_swapping = s->data[0] != GEOARROW_NATIVE_ENDIAN; - s->data++; - s->size_bytes--; - return 
GEOARROW_OK; - } else { - GeoArrowErrorSet(error, "Expected endian byte but found end of buffer at byte %ld", - (long)(s->data - s->data0)); - return EINVAL; - } -} - -static int WKBReaderReadUInt32(struct WKBReaderPrivate* s, uint32_t* out, - struct GeoArrowError* error) { - if (s->size_bytes >= 4) { - memcpy(out, s->data, sizeof(uint32_t)); - s->data += sizeof(uint32_t); - s->size_bytes -= sizeof(uint32_t); - if (s->need_swapping) { - *out = __builtin_bswap32(*out); - } - return GEOARROW_OK; - } else { - GeoArrowErrorSet(error, "Expected uint32 but found end of buffer at byte %ld", - (long)(s->data - s->data0)); - return EINVAL; - } +inline bool is_little_endian() { + const uint16_t x = 0x0001; + return *reinterpret_cast(&x) != 0; } /** @@ -105,6 +76,7 @@ template struct HostParsedGeometries { constexpr static int n_dim = POINT_T::n_dim; using mbr_t = Box>; + GeometryType type; // A general type that can reprs // each feature should have only one type except GeometryCollection std::vector feature_types; // This number should be one except GeometryCollection, which should be unnested # of @@ -120,17 +92,18 @@ struct HostParsedGeometries { bool has_geometry_collection = false; bool create_mbr = false; - HostParsedGeometries(bool multi_, bool has_geometry_collection_, bool create_mbr_) { + HostParsedGeometries(GeometryType t) : type(t) { + multi = type == GeometryType::kMultiPoint || type == GeometryType::kMultiLineString || + type == GeometryType::kMultiPolygon; + has_geometry_collection = type == GeometryType::kGeometryCollection; + create_mbr = type != GeometryType::kPoint; // Multi and GeometryCollection are mutually exclusive - assert(!(multi_ && has_geometry_collection_)); - multi = multi_; - has_geometry_collection = has_geometry_collection_; - create_mbr = create_mbr_; + assert(!(multi && has_geometry_collection)); } void AddGeometry(const GeoArrowGeometryView* geom) { if (geom == nullptr) { - throw std::runtime_error("Null geometry not supported yet"); + 
addNullEntry(); return; } @@ -405,6 +378,49 @@ struct HostParsedGeometries { } return node + 1; } + + void addNullEntry() { + // 1. Maintain MBR alignment if this type has MBRs + if (create_mbr) { + mbr_t empty_mbr; + empty_mbr.set_empty(); + mbrs.push_back(empty_mbr); + } + + // 2. Push zero-placeholders to maintain offset alignment + if (has_geometry_collection) { + // Null collection => 0 sub-geometries + num_geoms.push_back(0); + } else { + switch (type) { + case GeometryType::kPoint: { + // Push NaN point to represent empty/null + POINT_T p; + p.set_empty(); + vertices.push_back(p); + break; + } + case GeometryType::kLineString: + num_points.push_back(0); + break; + case GeometryType::kPolygon: + num_rings.push_back(0); + break; + case GeometryType::kMultiPoint: + num_points.push_back(0); + break; + case GeometryType::kMultiLineString: + num_parts.push_back(0); + break; + case GeometryType::kMultiPolygon: + num_parts.push_back(0); + break; + default: + throw std::runtime_error( + "Null geometry encountered for unsupported geometry type"); + } + } + } }; template @@ -442,7 +458,8 @@ struct DeviceParsedGeometries { } void Append(rmm::cuda_stream_view stream, - const std::vector>& host_geoms) { + const std::vector>& host_geoms, + double& t_alloc_ms, double& t_copy_ms) { size_t sz_feature_types = 0; size_t sz_num_geoms = 0; size_t sz_num_parts = 0; @@ -482,6 +499,9 @@ struct DeviceParsedGeometries { prev_sz_mbrs * sizeof(mbr_t) / 1024 / 1024, sz_mbrs * sizeof(mbr_t) / 1024 / 1024); + Stopwatch sw; + + sw.start(); feature_types.resize(feature_types.size() + sz_feature_types, stream); num_geoms.resize(num_geoms.size() + sz_num_geoms, stream); num_parts.resize(num_parts.size() + sz_num_parts, stream); @@ -489,7 +509,11 @@ struct DeviceParsedGeometries { num_points.resize(num_points.size() + sz_num_points, stream); vertices.resize(vertices.size() + sz_vertices, stream); mbrs.resize(mbrs.size() + sz_mbrs, stream); - + stream.synchronize(); + sw.stop(); + t_alloc_ms += 
sw.ms(); + Instrument::Range r("H2D", gpuspatial::Color::Blue); + sw.start(); for (auto& geoms : host_geoms) { detail::async_copy_h2d(stream, geoms.feature_types.data(), feature_types.data() + prev_sz_feature_types, @@ -518,6 +542,9 @@ struct DeviceParsedGeometries { prev_sz_vertices += geoms.vertices.size(); prev_sz_mbrs += geoms.mbrs.size(); } + stream.synchronize(); + sw.stop(); + t_copy_ms += sw.ms(); } }; } // namespace detail @@ -531,9 +558,7 @@ class ParallelWkbLoader { public: struct Config { - // How many rows of WKBs to process in one chunk - // This value affects the peak memory usage and overheads - int chunk_size = 16 * 1024; + float memory_quota = 0.8f; // percentage of free memory to use }; ParallelWkbLoader() @@ -543,9 +568,8 @@ class ParallelWkbLoader { : thread_pool_(thread_pool) {} void Init(const Config& config = Config()) { - ArrowArrayViewInitFromType(&array_view_, NANOARROW_TYPE_BINARY); config_ = config; - geometry_type_ = GeometryType::kNull; + Clear(rmm::cuda_stream_default); } void Clear(rmm::cuda_stream_view stream) { @@ -553,72 +577,97 @@ class ParallelWkbLoader { geoms_.Clear(stream); } - void Parse(rmm::cuda_stream_view stream, const ArrowArray* array, int64_t offset, - int64_t length) { - using host_geometries_t = detail::HostParsedGeometries; + void Parse(rmm::cuda_stream_view stream, const ArrowSchema* schema, + const ArrowArray* array, int64_t offset, int64_t length) { + auto begin = thrust::make_counting_iterator(offset); + auto end = begin + length; + + Parse(stream, schema, array, begin, end); + } + + template + void Parse(rmm::cuda_stream_view stream, const ArrowSchema* schema, + const ArrowArray* array, OFFSET_IT begin, OFFSET_IT end) { ArrowError arrow_error; - if (ArrowArrayViewSetArray(&array_view_, array, &arrow_error) != NANOARROW_OK) { + + if (ArrowArrayViewInitFromSchema(array_view_.get(), schema, &arrow_error) != + NANOARROW_OK) { + throw std::runtime_error("ArrowArrayViewInitFromSchema error " + + 
std::string(arrow_error.message)); + } + using host_geometries_t = detail::HostParsedGeometries; + + size_t num_offsets = std::distance(begin, end); + if (num_offsets == 0) return; + + if (ArrowArrayViewSetArray(array_view_.get(), array, &arrow_error) != NANOARROW_OK) { throw std::runtime_error("ArrowArrayViewSetArray error " + std::string(arrow_error.message)); } + auto parallelism = thread_pool_->num_threads(); - auto est_bytes = estimateTotalBytes(array, offset, length); - auto free_memory = detail::get_free_physical_memory_linux(); + uint64_t est_bytes = estimateTotalBytes(begin, end); + + uint64_t free_memory = detail::get_free_physical_memory_linux(); + uint64_t memory_quota = free_memory * config_.memory_quota; uint32_t est_n_chunks = est_bytes / free_memory + 1; - uint32_t chunk_size = (length + est_n_chunks - 1) / est_n_chunks; + + // Use num_offsets instead of offsets.size() + uint32_t chunk_size = (num_offsets + est_n_chunks - 1) / est_n_chunks; + uint32_t n_chunks = (num_offsets + chunk_size - 1) / chunk_size; GPUSPATIAL_LOG_INFO( - "Parsing %ld rows, est arrow size %ld MB, free memory %lld, chunk size %u\n", - length, est_bytes / 1024 / 1024, free_memory / 1024 / 1024, chunk_size); + "Parsing %zu rows, est ArrowArray size %lu MB, Free Host Memory %lu MB, Memory quota %lu MB, Chunk Size %u, Total Chunks %u", + num_offsets, est_bytes / 1024 / 1024, free_memory / 1024 / 1024, + memory_quota / 1024 / 1024, chunk_size, n_chunks); - auto n_chunks = (length + chunk_size - 1) / chunk_size; Stopwatch sw; double t_fetch_type = 0, t_parse = 0, t_copy = 0; + double t_alloc = 0, t_h2d = 0; sw.start(); - updateGeometryType(offset, length); + // Assumption: updateGeometryType is updated to accept iterators (begin, end) + updateGeometryType(begin, end); sw.stop(); t_fetch_type = sw.ms(); - bool multi = geometry_type_ == GeometryType::kMultiPoint || - geometry_type_ == GeometryType::kMultiLineString || - geometry_type_ == GeometryType::kMultiPolygon; - bool 
has_geometry_collection = geometry_type_ == GeometryType::kGeometryCollection; - bool create_mbr = geometry_type_ != GeometryType::kPoint; - // reserve space geoms_.vertices.reserve(est_bytes / sizeof(POINT_T), stream); - if (create_mbr) geoms_.mbrs.reserve(array->length, stream); + if (geometry_type_ != GeometryType::kPoint) + geoms_.mbrs.reserve(array->length, stream); // Batch processing to reduce the peak memory usage - for (int64_t chunk = 0; chunk < n_chunks; chunk++) { + for (size_t chunk = 0; chunk < n_chunks; chunk++) { auto chunk_start = chunk * chunk_size; - auto chunk_end = std::min(length, (chunk + 1) * chunk_size); - auto work_size = chunk_end - chunk_start; + auto chunk_end = std::min(num_offsets, (chunk + 1) * chunk_size); + auto split_points = + assignBalancedWorks(begin + chunk_start, begin + chunk_end, parallelism); std::vector> pending_local_geoms; - auto thread_work_size = (work_size + parallelism - 1) / parallelism; - sw.start(); // Each thread will parse in parallel and store results sequentially for (int thread_idx = 0; thread_idx < parallelism; thread_idx++) { auto run = [&](int tid) { - // FIXME: SetDevice - auto thread_work_start = chunk_start + tid * thread_work_size; - auto thread_work_end = - std::min(chunk_end, thread_work_start + thread_work_size); - host_geometries_t local_geoms(multi, has_geometry_collection, create_mbr); + auto thread_work_start = split_points[tid]; + auto thread_work_end = split_points[tid + 1]; + host_geometries_t local_geoms(geometry_type_); GeoArrowWKBReader reader; GeoArrowError error; - GEOARROW_THROW_NOT_OK(nullptr, GeoArrowWKBReaderInit(&reader)); + GEOARROW_THROW_NOT_OK(&error, GeoArrowWKBReaderInit(&reader)); + + uint64_t chunk_bytes = + estimateTotalBytes(begin + thread_work_start, begin + thread_work_end); + local_geoms.vertices.reserve(chunk_bytes / sizeof(POINT_T)); for (uint32_t work_offset = thread_work_start; work_offset < thread_work_end; work_offset++) { - auto arrow_offset = work_offset + 
offset; + // Use iterator indexing (Requires RandomAccessIterator) + auto arrow_offset = begin[chunk_start + work_offset]; + // handle null value - if (ArrowArrayViewIsNull(&array_view_, arrow_offset)) { + if (ArrowArrayViewIsNull(array_view_.get(), arrow_offset)) { local_geoms.AddGeometry(nullptr); } else { - auto item = ArrowArrayViewGetBytesUnsafe(&array_view_, arrow_offset); + auto item = ArrowArrayViewGetBytesUnsafe(array_view_.get(), arrow_offset); GeoArrowGeometryView geom; GEOARROW_THROW_NOT_OK( @@ -629,6 +678,7 @@ class ParallelWkbLoader { } } + GeoArrowWKBReaderReset(&reader); return std::move(local_geoms); }; pending_local_geoms.push_back(std::move(thread_pool_->enqueue(run, thread_idx))); @@ -641,15 +691,14 @@ class ParallelWkbLoader { sw.stop(); t_parse += sw.ms(); sw.start(); - geoms_.Append(stream, local_geoms); + geoms_.Append(stream, local_geoms, t_alloc, t_h2d); stream.synchronize(); sw.stop(); t_copy += sw.ms(); } GPUSPATIAL_LOG_INFO( - "ParallelWkbLoader::Parse: fetched type in %.3f ms, parsed in %.3f ms, copied in " - "%.3f ms", - t_fetch_type, t_parse, t_copy); + "ParallelWkbLoader::Parse: fetched type in %.3f ms, parsed in %.3f ms, alloc %.3f ms, h2d copy %.3f ms", + t_fetch_type, t_parse, t_alloc, t_h2d); } DeviceGeometries Finish(rmm::cuda_stream_view stream) { @@ -746,8 +795,10 @@ class ParallelWkbLoader { std::move(ps_num_points); break; } + default: + throw std::runtime_error("Unsupported geometry type " + + GeometryTypeToString(geometry_type_) + " in Finish"); } - Clear(stream); stream.synchronize(); sw.stop(); GPUSPATIAL_LOG_INFO("Finish building DeviceGeometries in %.3f ms", sw.ms()); @@ -756,102 +807,99 @@ class ParallelWkbLoader { private: Config config_; - ArrowArrayView array_view_; + nanoarrow::UniqueArrayView array_view_; GeometryType geometry_type_; detail::DeviceParsedGeometries geoms_; std::shared_ptr thread_pool_; - void updateGeometryType(int64_t offset, int64_t length) { + template + void updateGeometryType(OFFSET_IT 
begin, OFFSET_IT end) { if (geometry_type_ == GeometryType::kGeometryCollection) { - // it's already the most generic type return; } - std::vector type_flags(8 /*WKB types*/, false); - std::vector workers; + size_t num_offsets = std::distance(begin, end); + if (num_offsets == 0) return; + auto parallelism = thread_pool_->num_threads(); - auto thread_work_size = (length + parallelism - 1) / parallelism; - std::vector> futures; + auto thread_work_size = (num_offsets + parallelism - 1) / parallelism; + + std::vector> futures; + futures.reserve(parallelism); + + // Detect Endianness once (outside the loop) + const bool host_is_little = detail::is_little_endian(); for (int thread_idx = 0; thread_idx < parallelism; thread_idx++) { - auto run = [&](int tid) { - auto thread_work_start = tid * thread_work_size; - auto thread_work_end = std::min(length, thread_work_start + thread_work_size); - GeoArrowWKBReader reader; - GeoArrowError error; - GEOARROW_THROW_NOT_OK(nullptr, GeoArrowWKBReaderInit(&reader)); + auto run = [=](int tid) -> uint32_t { + size_t thread_work_start = tid * thread_work_size; + size_t thread_work_end = + std::min(num_offsets, thread_work_start + thread_work_size); + + uint32_t local_seen_mask = 0; for (uint32_t work_offset = thread_work_start; work_offset < thread_work_end; work_offset++) { - auto arrow_offset = work_offset + offset; - // handle null value - if (ArrowArrayViewIsNull(&array_view_, arrow_offset)) { + auto arrow_offset = begin[work_offset]; + + if (ArrowArrayViewIsNull(array_view_.get(), arrow_offset)) { continue; } - auto item = ArrowArrayViewGetBytesUnsafe(&array_view_, arrow_offset); - auto* s = (struct detail::WKBReaderPrivate*)reader.private_data; - s->data = item.data.as_uint8; - s->data0 = s->data; - s->size_bytes = item.size_bytes; + auto item = ArrowArrayViewGetBytesUnsafe(array_view_.get(), arrow_offset); + + // Safety check: WKB minimal size is 5 bytes (1 byte order + 4 type) + if (item.size_bytes < 5) continue; + + const 
uint8_t* data = item.data.as_uint8; + + // 1. Read Endianness Byte (0 = Big/XDR, 1 = Little/NDR) + uint8_t wkb_endian = data[0]; - NANOARROW_THROW_NOT_OK(detail::WKBReaderReadEndian(s, &error)); + // 2. Read Type (Bytes 1-4) uint32_t geometry_type; - NANOARROW_THROW_NOT_OK(detail::WKBReaderReadUInt32(s, &geometry_type, &error)); + std::memcpy(&geometry_type, data + 1, sizeof(uint32_t)); + + // 3. Swap if mismatch + // If (WKB is Little) != (Host is Little), we must swap + if ((wkb_endian == 1) != host_is_little) { + geometry_type = __builtin_bswap32(geometry_type); + } + + // 4. Validate and Accumulate (Branchless Masking) if (geometry_type > 7) { - throw std::runtime_error( - "Extended WKB types are not currently supported, type = " + - std::to_string(geometry_type)); + // It's safer to throw exception outside the tight loop or set an error flag + // For now, we skip or you can throw. + throw std::runtime_error("Extended WKB types not supported: " + + std::to_string(geometry_type)); } - assert(geometry_type < type_flags.size()); - type_flags[geometry_type] = true; + + local_seen_mask |= (1 << geometry_type); } + return local_seen_mask; }; + futures.push_back(std::move(thread_pool_->enqueue(run, thread_idx))); } + + // Reduction + uint32_t global_mask = 0; for (auto& fu : futures) { - fu.get(); + global_mask |= fu.get(); } std::unordered_set types; - // include existing geometry type if (geometry_type_ != GeometryType::kNull) { types.insert(geometry_type_); } for (int i = 1; i <= 7; i++) { - if (type_flags[i]) { + if (global_mask & (1 << i)) { types.insert(static_cast(i)); } } - GeometryType final_type; - // Infer a generic type that can represent the current and previous types - switch (types.size()) { - case 0: - final_type = GeometryType::kNull; - break; - case 1: - final_type = *types.begin(); - break; - case 2: { - if (types.count(GeometryType::kPoint) && types.count(GeometryType::kMultiPoint)) { - final_type = GeometryType::kMultiPoint; - } else if 
(types.count(GeometryType::kLineString) && - types.count(GeometryType::kMultiLineString)) { - final_type = GeometryType::kMultiLineString; - } else if (types.count(GeometryType::kPolygon) && - types.count(GeometryType::kMultiPolygon)) { - final_type = GeometryType::kMultiPolygon; - } else { - final_type = GeometryType::kGeometryCollection; - } - break; - } - default: - final_type = GeometryType::kGeometryCollection; - } - geometry_type_ = final_type; + geometry_type_ = getUpcastedGeometryType(types); } template @@ -875,21 +923,107 @@ class ParallelWkbLoader { nums.shrink_to_fit(stream); } - size_t estimateTotalBytes(const ArrowArray* array, int64_t offset, int64_t length) { - ArrowError arrow_error; - if (ArrowArrayViewSetArray(&array_view_, array, &arrow_error) != NANOARROW_OK) { - throw std::runtime_error("ArrowArrayViewSetArray error " + - std::string(arrow_error.message)); - } + template + size_t estimateTotalBytes(OFFSET_IT begin, OFFSET_IT end) const { size_t total_bytes = 0; - for (int64_t i = 0; i < length; i++) { - if (!ArrowArrayViewIsNull(&array_view_, offset + i)) { - auto item = ArrowArrayViewGetBytesUnsafe(&array_view_, offset + i); + for (auto it = begin; it != end; ++it) { + auto offset = *it; + if (!ArrowArrayViewIsNull(array_view_.get(), offset)) { + auto item = ArrowArrayViewGetBytesUnsafe(array_view_.get(), offset); total_bytes += item.size_bytes - 1 // byte order - 2 * sizeof(uint32_t); // type + size } } return total_bytes; } + + template + std::vector assignBalancedWorks(OFFSET_IT begin, OFFSET_IT end, + uint32_t num_threads) const { + size_t total_bytes = 0; + std::vector bytes_per_row; + size_t num_rows = std::distance(begin, end); + + bytes_per_row.resize(num_rows, 0); + + // 1. 
Calculate bytes per row + for (auto it = begin; it != end; ++it) { + auto offset = *it; + if (!ArrowArrayViewIsNull(array_view_.get(), offset)) { + auto item = ArrowArrayViewGetBytesUnsafe(array_view_.get(), offset); + // Assuming item.size_bytes fits in uint32_t based on vector definition + bytes_per_row[it - begin] = static_cast(item.size_bytes); + } + } + + // 2. Calculate prefix sum + // We use size_t (or uint64_t) for the sum to prevent overflow + std::vector prefix_sum; + prefix_sum.reserve(num_rows + 1); + prefix_sum.push_back(0); + + for (uint32_t b : bytes_per_row) { + total_bytes += b; + prefix_sum.push_back(total_bytes); + } + + // 3. Calculate balanced split points + std::vector split_points; + split_points.reserve(num_threads + 1); + split_points.push_back(0); // The start index for the first thread + + // Avoid division by zero + if (num_threads > 0) { + double ideal_chunk_size = static_cast(total_bytes) / num_threads; + + for (uint32_t i = 1; i < num_threads; ++i) { + auto target_size = static_cast(i * ideal_chunk_size); + + // Find the first index where cumulative bytes >= target_size + auto it = std::lower_bound(prefix_sum.begin(), prefix_sum.end(), target_size); + + // Convert iterator to index (row number) + auto split_index = static_cast(std::distance(prefix_sum.begin(), it)); + split_points.push_back(split_index); + } + } + + // Ensure the last point is the total number of rows + // If num_threads was 0, this will be the second element (0, num_rows) + split_points.push_back(static_cast(num_rows)); + + return split_points; + } + + GeometryType getUpcastedGeometryType( + const std::unordered_set& types) const { + GeometryType final_type; + // Infer a generic type that can represent the current and previous types + switch (types.size()) { + case 0: + final_type = GeometryType::kNull; + break; + case 1: + final_type = *types.begin(); + break; + case 2: { + if (types.count(GeometryType::kPoint) && types.count(GeometryType::kMultiPoint)) { + 
final_type = GeometryType::kMultiPoint; + } else if (types.count(GeometryType::kLineString) && + types.count(GeometryType::kMultiLineString)) { + final_type = GeometryType::kMultiLineString; + } else if (types.count(GeometryType::kPolygon) && + types.count(GeometryType::kMultiPolygon)) { + final_type = GeometryType::kMultiPolygon; + } else { + final_type = GeometryType::kGeometryCollection; + } + break; + } + default: + final_type = GeometryType::kGeometryCollection; + } + return final_type; + } }; } // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh new file mode 100644 index 000000000..a2173f1ae --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.cuh @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#pragma once +#include "gpuspatial/geom/box.cuh" +#include "gpuspatial/geom/point.cuh" +#include "gpuspatial/loader/device_geometries.cuh" +#include "gpuspatial/loader/parallel_wkb_loader.h" +#include "gpuspatial/refine/rt_spatial_refiner.hpp" +#include "gpuspatial/refine/spatial_refiner.hpp" +#include "gpuspatial/relate/relate_engine.cuh" +#include "gpuspatial/rt/rt_engine.hpp" +#include "gpuspatial/utils/gpu_timer.hpp" +#include "gpuspatial/utils/thread_pool.h" + +#include "geoarrow/geoarrow_type.h" +#include "nanoarrow/nanoarrow.h" + +#include "rmm/cuda_stream_pool.hpp" +#include "rmm/cuda_stream_view.hpp" + +#include + +#define GPUSPATIAL_PROFILING +namespace gpuspatial { + +class RTSpatialRefiner : public SpatialRefiner { + // TODO: Assuming every thing is 2D in double for now + using scalar_t = double; + static constexpr int n_dim = 2; + using index_t = uint32_t; // type of the index to represent geometries + // geometry types + using point_t = Point; + using multi_point_t = MultiPoint; + using line_string_t = LineString; + using multi_line_string_t = MultiLineString; + using polygon_t = Polygon; + using multi_polygon_t = MultiPolygon; + // geometry array types + using point_array_t = PointArrayView; + using multi_point_array_t = MultiPointArrayView; + using line_string_array_t = LineStringArrayView; + using multi_line_string_array_t = MultiLineStringArrayView; + using polygon_array_t = PolygonArrayView; + using multi_polygon_array_t = MultiPolygonArrayView; + + using dev_geometries_t = DeviceGeometries; + using box_t = Box>; + using loader_t = ParallelWkbLoader; + + static_assert(sizeof(Box>) == sizeof(box_t), + "Box> size mismatch!"); + + public: + struct IndicesMap { + // Sorted unique original indices + std::vector h_uniq_indices; + rmm::device_uvector d_uniq_indices{0, rmm::cuda_stream_default}; + // Mapping from original indices to consecutive zero-based indices + rmm::device_uvector d_reordered_indices{0, rmm::cuda_stream_default}; + }; + struct 
SpatialRefinerContext { + rmm::cuda_stream_view cuda_stream; +#ifdef GPUSPATIAL_PROFILING + GPUTimer timer; + // counters + double parse_ms = 0.0; + double alloc_ms = 0.0; + double refine_ms = 0.0; + double copy_res_ms = 0.0; +#endif + }; + + RTSpatialRefiner() = default; + + RTSpatialRefiner(const RTSpatialRefinerConfig& config); + + ~RTSpatialRefiner() = default; + + void Clear() override; + + void PushBuild(const ArrowSchema* build_schema, const ArrowArray* build_array) override; + + void FinishBuilding() override; + + uint32_t Refine(const ArrowSchema* probe_schema, const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, uint32_t* probe_indices, + uint32_t len) override; + + uint32_t Refine(const ArrowSchema* build_schema, const ArrowArray* build_array, + const ArrowSchema* probe_schema, const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, uint32_t* probe_indices, + uint32_t len) override; + + uint32_t RefinePipelined(const ArrowSchema* probe_schema, const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, + uint32_t* probe_indices, uint32_t len); + + private: + RTSpatialRefinerConfig config_; + std::unique_ptr stream_pool_; + std::shared_ptr thread_pool_; + std::unique_ptr> wkb_loader_; + dev_geometries_t build_geometries_; + + template + void buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT index_begin, + INDEX_IT index_end, IndicesMap& indices_map) const; +}; + +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.hpp new file mode 100644 index 000000000..6b6978799 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/rt_spatial_refiner.hpp @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once + +#include "gpuspatial/refine/spatial_refiner.hpp" +#include "gpuspatial/rt/rt_engine.hpp" + +#include + +namespace gpuspatial { + +struct RTSpatialRefinerConfig { + std::shared_ptr rt_engine; + // Prefer fast build the BVH + bool prefer_fast_build = false; + // Compress the BVH to save memory + bool compact = true; + // Loader configurations + // How many threads to use for parsing WKBs + uint32_t parsing_threads = std::thread::hardware_concurrency(); + // How many threads are allowed to call PushStream concurrently + uint32_t concurrency = 1; + // Overlapping parsing and refinement by pipelining multiple batches; 1 means no + // pipelining + uint32_t pipeline_batches = 1; + // the host memory quota for WKB parser compared to the available memory + float wkb_parser_memory_quota = 0.8; + // the device memory quota for relate engine compared to the available memory + float relate_engine_memory_quota = 0.8; + // this value determines RELATE_MAX_DEPTH + size_t stack_size_bytes = 3 * 1024; + bool sort_probe_indices = true; // Sedona's spatial-join may require ordered output +}; + +std::unique_ptr CreateRTSpatialRefiner( + const RTSpatialRefinerConfig& config); + +} // namespace gpuspatial diff --git 
a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/spatial_refiner.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/spatial_refiner.hpp new file mode 100644 index 000000000..60dd33451 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/refine/spatial_refiner.hpp @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#pragma once +#include "gpuspatial/relate/predicate.cuh" + +#include "nanoarrow/nanoarrow.h" + +namespace gpuspatial { +class SpatialRefiner { + public: + virtual ~SpatialRefiner() = default; + + virtual void Clear() = 0; + + virtual void PushBuild(const ArrowSchema* build_schema, + const ArrowArray* build_array) = 0; + + virtual void FinishBuilding() = 0; + + virtual uint32_t Refine(const ArrowSchema* probe_schema, const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, + uint32_t* probe_indices, uint32_t len) = 0; + + virtual uint32_t Refine(const ArrowSchema* build_schema, const ArrowArray* build_array, + const ArrowSchema* probe_schema, const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, + uint32_t* probe_indices, uint32_t len) = 0; +}; + +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/relate_engine.cuh b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate_engine.cuh similarity index 67% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/relate_engine.cuh rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate_engine.cuh index 5fb275078..a9518fb36 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/relate_engine.cuh +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/relate/relate_engine.cuh @@ -15,10 +15,9 @@ // specific language governing permissions and limitations // under the License. 
#pragma once -#include "gpuspatial/index/detail/rt_engine.hpp" #include "gpuspatial/loader/device_geometries.cuh" #include "gpuspatial/relate/predicate.cuh" -#include "gpuspatial/utils/queue.h" +#include "gpuspatial/rt/rt_engine.hpp" #include "rmm/cuda_stream_view.hpp" @@ -31,8 +30,9 @@ class RelateEngine { public: struct Config { bool bvh_fast_build = false; - bool bvh_fast_compact = true; + bool bvh_compact = true; float memory_quota = 0.8; + int segs_per_aabb = 32; }; RelateEngine() = default; @@ -40,80 +40,94 @@ class RelateEngine { RelateEngine(const DeviceGeometries* geoms1); RelateEngine(const DeviceGeometries* geoms1, - const details::RTEngine* rt_engine); + const RTEngine* rt_engine); void set_config(const Config& config) { config_ = config; } void Evaluate(const rmm::cuda_stream_view& stream, const DeviceGeometries& geoms2, Predicate predicate, - Queue>& ids); + rmm::device_uvector& ids1, rmm::device_uvector& ids2); template void Evaluate(const rmm::cuda_stream_view& stream, const GEOM2_ARRAY_VIEW_T& geom_array2, Predicate predicate, - Queue>& ids); + rmm::device_uvector& ids1, rmm::device_uvector& ids2); // This is a generic version that can accept any two geometry array views template void Evaluate(const rmm::cuda_stream_view& stream, const GEOM1_ARRAY_VIEW_T& geom_array1, const GEOM2_ARRAY_VIEW_T& geom_array2, Predicate predicate, - Queue>& ids); + rmm::device_uvector& ids1, rmm::device_uvector& ids2); // These are the specific overloads for RT-accelerated PIP queries void Evaluate(const rmm::cuda_stream_view& stream, const PointArrayView& geom_array1, const PolygonArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const MultiPointArrayView& geom_array1, const PolygonArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void 
Evaluate(const rmm::cuda_stream_view& stream, const PolygonArrayView& geom_array1, const PointArrayView& geom_array2, Predicate predicate, - Queue>& ids); + rmm::device_uvector& ids1, rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const PolygonArrayView& geom_array1, const MultiPointArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const PointArrayView& geom_array1, const MultiPolygonArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const MultiPointArrayView& geom_array1, const MultiPolygonArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& geom_array1, const PointArrayView& geom_array2, Predicate predicate, - Queue>& ids); + rmm::device_uvector& ids1, rmm::device_uvector& ids2); void Evaluate(const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& geom_array1, const MultiPointArrayView& geom_array2, - Predicate predicate, Queue>& ids); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2); void EvaluateImpl(const rmm::cuda_stream_view& stream, const PointArrayView& point_array, const MultiPointArrayView& multi_point_array, const PolygonArrayView& poly_array, - Predicate predicate, Queue>& ids, - bool inverse = false); + Predicate predicate, rmm::device_uvector& point_ids, + rmm::device_uvector& poly_ids, bool inverse = false); void EvaluateImpl(const rmm::cuda_stream_view& stream, const PointArrayView& point_array, const MultiPointArrayView& multi_point_array, const MultiPolygonArrayView& multi_poly_array, - Predicate predicate, Queue>& ids, - bool 
inverse); + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2, bool inverse); + + size_t EstimateBVHSize(const rmm::cuda_stream_view& stream, + const PolygonArrayView& polys, + ArrayView poly_ids, int segs_per_aabb); + + size_t EstimateBVHSize(const rmm::cuda_stream_view& stream, + const MultiPolygonArrayView& multi_polys, + ArrayView multi_poly_ids, int segs_per_aabb); /** * Build BVH for a subset of polygons @@ -122,34 +136,27 @@ class RelateEngine { * @param polygon_ids * @param buffer */ - OptixTraversableHandle BuildBVH(const rmm::cuda_stream_view& stream, - const PolygonArrayView& polygons, - ArrayView polygon_ids, - rmm::device_uvector& seg_begins, - rmm::device_buffer& buffer, - rmm::device_uvector& aabb_poly_ids, - rmm::device_uvector& aabb_ring_ids); + OptixTraversableHandle BuildBVH( + const rmm::cuda_stream_view& stream, + const PolygonArrayView& polygons, ArrayView polygon_ids, + int segs_per_aabb, rmm::device_buffer& buffer, + rmm::device_uvector& aabb_poly_ids, + rmm::device_uvector& aabb_ring_ids, + rmm::device_uvector>& aabb_vertex_offsets); OptixTraversableHandle BuildBVH( const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& multi_polys, - ArrayView multi_poly_ids, rmm::device_uvector& seg_begins, - rmm::device_uvector& part_begins, rmm::device_buffer& buffer, + ArrayView multi_poly_ids, int segs_per_aabb, rmm::device_buffer& buffer, rmm::device_uvector& aabb_multi_poly_ids, rmm::device_uvector& aabb_part_ids, - rmm::device_uvector& aabb_ring_ids); - - size_t EstimateBVHSize(const rmm::cuda_stream_view& stream, - const PolygonArrayView& polys, - ArrayView poly_ids); - - size_t EstimateBVHSize(const rmm::cuda_stream_view& stream, - const MultiPolygonArrayView& multi_polys, - ArrayView multi_poly_ids); + rmm::device_uvector& aabb_ring_ids, + rmm::device_uvector>& aabb_vertex_offsets, + rmm::device_uvector& part_begins); private: Config config_; const DeviceGeometries* geoms1_; - const details::RTEngine* 
rt_engine_; + const RTEngine* rt_engine_; }; } // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/launch_parameters.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/launch_parameters.h similarity index 75% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/launch_parameters.h rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/launch_parameters.h index 555d2504c..d7c6bbecb 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/launch_parameters.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/launch_parameters.h @@ -31,29 +31,29 @@ namespace detail { template struct LaunchParamsPointQuery { - using box_t = Box>; - // Data structures of geometries1 - bool grouped; - ArrayView prefix_sum; // Only used when grouped - ArrayView reordered_indices; // Only used when grouped - ArrayView mbrs1; // MBR of each feature in geometries1 + using box_t = Box; + // Input + ArrayView rects; + ArrayView points; OptixTraversableHandle handle; - // Data structures of geometries2 - ArrayView points2; - // Output: Geom1 ID, Geom2 ID - QueueView> ids; + uint32_t* count; + // Output + QueueView rect_ids; + ArrayView point_ids; }; template struct LaunchParamsBoxQuery { - using box_t = Box>; + using box_t = Box; // Input - ArrayView mbrs1; - ArrayView mbrs2; + ArrayView rects1; + ArrayView rects2; // can be either geometries 1 or 2 OptixTraversableHandle handle; - // Output: Geom2 ID, Geom2 ID - QueueView> ids; + uint32_t* count; + // Output + QueueView rect1_ids; + ArrayView rect2_ids; }; /** @@ -67,12 +67,15 @@ struct LaunchParamsPolygonPointQuery { MultiPointArrayView multi_points; PointArrayView points; PolygonArrayView polygons; - ArrayView polygon_ids; // sorted - ArrayView> ids; + ArrayView uniq_polygon_ids; // sorted + index_t* query_point_ids; + index_t* query_polygon_ids; + size_t query_size; ArrayView seg_begins; 
ArrayView IMs; // intersection matrices OptixTraversableHandle handle; ArrayView aabb_poly_ids, aabb_ring_ids; + ArrayView> aabb_vertex_offsets; }; /** @@ -87,14 +90,16 @@ struct LaunchParamsPointMultiPolygonQuery { // Either MultiPointArrayView or PointArrayView will be used MultiPointArrayView multi_points; PointArrayView points; - ArrayView multi_polygon_ids; // sorted - ArrayView> ids; - ArrayView seg_begins; - ArrayView uniq_part_begins; + ArrayView uniq_multi_polygon_ids; // sorted + index_t* query_point_ids; + index_t* query_multi_polygon_ids; + size_t query_size; + ArrayView uniq_part_begins; // used to calculate z-index for parts // each query point has n elements of part_min_y and part_locations, n is # of parts ArrayView IMs; // intersection matrices OptixTraversableHandle handle; ArrayView aabb_multi_poly_ids, aabb_part_ids, aabb_ring_ids; + ArrayView> aabb_vertex_offsets; }; } // namespace detail diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/rt_engine.hpp b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/rt_engine.hpp similarity index 98% rename from c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/rt_engine.hpp rename to c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/rt_engine.hpp index d571feaa7..e0a4474c5 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/index/detail/rt_engine.hpp +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/rt/rt_engine.hpp @@ -33,7 +33,6 @@ #define GPUSPATIAL_OPTIX_LAUNCH_PARAMS_NAME "params" namespace gpuspatial { -namespace details { /*! 
SBT record for a raygen program */ struct __align__(OPTIX_SBT_RECORD_ALIGNMENT) RaygenRecord { @@ -160,6 +159,9 @@ RTConfig get_default_rt_config(const std::string& ptx_root); class RTEngine { public: + RTEngine(const RTEngine&) = delete; + RTEngine& operator=(const RTEngine&) = delete; + RTEngine(); ~RTEngine(); @@ -201,5 +203,4 @@ class RTEngine { bool initialized_; }; -} // namespace details } // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.h index 2f6941704..4cca08fd0 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/cuda_utils.h @@ -28,7 +28,7 @@ #else #define DEV_HOST -#define DEV_HOST_INLINE +#define DEV_HOST_INLINE inline #define DEV_INLINE #define CONST_STATIC_INIT(...) = __VA_ARGS__ diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.h index a35005ebe..ab6f174e7 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/exception.h @@ -53,7 +53,7 @@ inline void optixCheck(OptixResult res, const char* call, const char* file, std::stringstream ss; ss << "OptiX API call (" << call << ") failed with error " << optixGetErrorName(res) << " (" << file << ":" << line << ")"; - GPUSPATIAL_LOG_ERROR("Optix API error: {}", ss.str()); + GPUSPATIAL_LOG_ERROR("Optix API error: %s", ss.str()); throw GPUException(res, ss.str().c_str()); } } @@ -64,7 +64,7 @@ inline void cudaCheck(cudaError_t error, const char* call, const char* file, std::stringstream ss; ss << "CUDA API call (" << call << ") failed with error " << cudaGetErrorString(error) << " (" << file << ":" << line << ")"; - GPUSPATIAL_LOG_ERROR("CUDA API error: {}", 
ss.str()); + GPUSPATIAL_LOG_ERROR("CUDA API error: %s", ss.str()); throw GPUException(ss.str().c_str()); } } diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/markers.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/markers.h new file mode 100644 index 000000000..d5f394dd8 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/markers.h @@ -0,0 +1,129 @@ +#pragma once + +#include +#define DISABLE_NVTX_MARKERS + +#ifndef DISABLE_NVTX_MARKERS +#include +#endif + +namespace gpuspatial { + +struct Category { + static constexpr uint32_t KernelWorkitems = 1; + static constexpr uint32_t IntervalWorkitems = 2; +}; + +// Colors in ARGB format (Alpha, Red, Green, Blue) +struct Color { + static constexpr uint32_t Red = 0xFF880000; + static constexpr uint32_t Green = 0xFF008800; + static constexpr uint32_t Blue = 0xFF000088; + static constexpr uint32_t Yellow = 0xFFFFFF00; + static constexpr uint32_t Default = 0; +}; + +#ifndef DISABLE_NVTX_MARKERS + +struct Instrument { + // --------------------------------------------------------------------------- + // Helper: Create attributes correctly using constructors + // --------------------------------------------------------------------------- + static nvtx3::event_attributes create_attr(const char* msg, uint32_t color_val, + uint32_t category_val) { + // 1. Basic Message + nvtx3::event_attributes attr{msg}; + + // 2. Apply Color (if not default) + if (color_val != Color::Default) { + // Use nvtx3::rgb wrapping the uint32_t directly usually works, + // but if it fails, we assign to the internal color_type directly via the generic + // color wrapper + attr = nvtx3::event_attributes{msg, nvtx3::color{color_val}}; + } + + // 3. Apply Category (if valid) + // Note: We cannot "append" to an existing immutable object. + // We must construct with all arguments at once. 
+ + if (color_val != Color::Default && category_val != 0) { + return nvtx3::event_attributes{msg, nvtx3::color{color_val}, + nvtx3::category{category_val}}; + } else if (color_val != Color::Default) { + return nvtx3::event_attributes{msg, nvtx3::color{color_val}}; + } else if (category_val != 0) { + return nvtx3::event_attributes{msg, nvtx3::category{category_val}}; + } + + return attr; + } + + // --------------------------------------------------------------------------- + // Instant Markers + // --------------------------------------------------------------------------- + static void Mark(const char* message, uint32_t color = Color::Default, + uint32_t category = 0) { + nvtx3::mark(create_attr(message, color, category)); + } + + static void MarkInt(int64_t value, const char* message, uint32_t color = Color::Default, + uint32_t category = 0) { + // Construct with payload immediately + // Note: If you need color+category+payload, the constructor list gets long. + // This covers the most common case: Message + Payload + if (color == Color::Default && category == 0) { + nvtx3::event_attributes attr{message, nvtx3::payload{value}}; + nvtx3::mark(attr); + } else { + // Fallback: manually construct complex attribute + // Most NVTX3 versions support {msg, color, payload, category} in any order + nvtx3::event_attributes attr{message, nvtx3::color{color}, + nvtx3::category{category}, nvtx3::payload{value}}; + nvtx3::mark(attr); + } + } + + static void MarkWorkitems(uint64_t items, const char* message = "Workitems") { + nvtx3::event_attributes attr{message, nvtx3::payload{items}, + nvtx3::category{Category::KernelWorkitems}}; + nvtx3::mark(attr); + } + + // --------------------------------------------------------------------------- + // Scoped Ranges (RAII) + // --------------------------------------------------------------------------- + struct Range { + nvtx3::scoped_range range; + + // Standard Range + explicit Range(const char* message, uint32_t color = Color::Default, 
+ uint32_t category = 0) + : range(Instrument::create_attr(message, color, category)) {} + + // Payload Range (for workitems/intervals) + explicit Range(const char* message, uint64_t payload, + uint32_t category = Category::IntervalWorkitems) + : range(nvtx3::event_attributes{message, nvtx3::payload{payload}, + nvtx3::category{category}}) {} + }; +}; + +#else + +// ----------------------------------------------------------------------------- +// No-Op Implementation +// ----------------------------------------------------------------------------- +struct Instrument { + static inline void Mark(const char*, uint32_t = 0, uint32_t = 0) {} + static inline void MarkInt(int64_t, const char*, uint32_t = 0, uint32_t = 0) {} + static inline void MarkWorkitems(uint64_t, const char*) {} + + struct Range { + explicit Range(const char*, uint32_t = 0, uint32_t = 0) {} + explicit Range(const char*, uint64_t, uint32_t = 0) {} + }; +}; + +#endif // DISABLE_NVTX_MARKERS + +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.h b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.h index 29beac229..4087a58e6 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.h +++ b/c/sedona-libgpuspatial/libgpuspatial/include/gpuspatial/utils/queue.h @@ -41,6 +41,7 @@ class Queue { if (counter_ == nullptr) { counter_ = std::make_unique>(stream); } + Clear(stream); } void Clear(const rmm::cuda_stream_view& stream) { diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc b/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc index 58ef354ab..062b450bc 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc +++ b/c/sedona-libgpuspatial/libgpuspatial/src/gpuspatial_c.cc @@ -14,157 +14,369 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
+ #include "gpuspatial/gpuspatial_c.h" -#include "gpuspatial/index/spatial_joiner.hpp" +#include "gpuspatial/index/rt_spatial_index.hpp" +#include "gpuspatial/index/spatial_index.hpp" +#include "gpuspatial/refine/rt_spatial_refiner.hpp" +#include "gpuspatial/rt/rt_engine.hpp" +#include "gpuspatial/utils/exception.h" + +#include "rmm/mr/device/cuda_async_memory_resource.hpp" +#include "rmm/mr/device/per_device_resource.hpp" +#include "rmm/mr/device/pool_memory_resource.hpp" #include +#include +#include #include -#define GPUSPATIAL_ERROR_MSG_BUFFER_SIZE (1024) -struct GpuSpatialJoinerExporter { - static void Export(std::unique_ptr& idx, - struct GpuSpatialJoiner* out) { - out->private_data = idx.release(); - out->init = &CInit; +// ----------------------------------------------------------------------------- +// INTERNAL HELPERS +// ----------------------------------------------------------------------------- +// This is what the private_data points to for the public C interfaces +template +struct GpuSpatialWrapper { + T payload; + std::string last_error; // Pointer to std::string to store last error message +}; + +// The unified error handling wrapper +// Func: The lambda containing the logic +template +int SafeExecute(GpuSpatialWrapper* wrapper, Func&& func) { + try { + func(); + wrapper->last_error.clear(); + return 0; + } catch (const std::exception& e) { + wrapper->last_error = std::string(e.what()); + return EINVAL; + } catch (...) 
{ + wrapper->last_error = "Unknown internal error"; + return EINVAL; + } +} + +// ----------------------------------------------------------------------------- +// IMPLEMENTATION +// ----------------------------------------------------------------------------- + +struct GpuSpatialRuntimeExporter { + struct Payload { + std::shared_ptr rt_engine; + std::unique_ptr upstream_mr; + std::unique_ptr> + pool_mr; + int device_id; + }; + + using private_data_t = GpuSpatialWrapper; + static void Export(struct GpuSpatialRuntime* out) { + private_data_t* private_data = + new private_data_t{Payload{std::make_shared()}, ""}; + out->init = CInit; + out->release = CRelease; + out->get_last_error = CGetLastError; + out->private_data = private_data; + } + + static int CInit(GpuSpatialRuntime* self, GpuSpatialRuntimeConfig* config) { + return SafeExecute(static_cast(self->private_data), [&] { + std::string ptx_root(config->ptx_root); + auto rt_config = gpuspatial::get_default_rt_config(ptx_root); + + GPUSPATIAL_LOG_INFO("Initializing GpuSpatialRuntime on device %d, PTX root %s", + config->device_id, config->ptx_root); + + CUDA_CHECK(cudaSetDevice(config->device_id)); + + float mem_pool_ratio = config->cuda_init_memory_pool_ratio; + + if (mem_pool_ratio < 0 || mem_pool_ratio > 1) { + throw std::invalid_argument( + "cuda_init_memory_pool_ratio must be between 0 and 1"); + } + + if (mem_pool_ratio > 0) { + auto async_mr = std::make_unique(); + auto pool_size = rmm::percent_of_free_device_memory(mem_pool_ratio); + + GPUSPATIAL_LOG_INFO("Creating RMM pool memory resource with size %zu MB", + pool_size / 1024 / 1024); + + auto pool_mr = std::make_unique< + rmm::mr::pool_memory_resource>( + async_mr.get(), pool_size); + + rmm::mr::set_current_device_resource(pool_mr.get()); + static_cast(self->private_data)->payload.upstream_mr = + std::move(async_mr); + static_cast(self->private_data)->payload.pool_mr = + std::move(pool_mr); + } + + static_cast(self->private_data) + 
->payload.rt_engine->Init(rt_config); + }); + } + + static void CRelease(GpuSpatialRuntime* self) { + delete static_cast(self->private_data); + self->private_data = nullptr; + } + + static const char* CGetLastError(GpuSpatialRuntime* self) { + auto* private_data = static_cast(self->private_data); + return private_data->last_error.c_str(); + } +}; + +void GpuSpatialRuntimeCreate(struct GpuSpatialRuntime* runtime) { + GpuSpatialRuntimeExporter::Export(runtime); +} + +using runtime_data_t = GpuSpatialRuntimeExporter::private_data_t; + +struct GpuSpatialIndexFloat2DExporter { + using scalar_t = float; + static constexpr int n_dim = 2; + using self_t = SedonaFloatIndex2D; + using spatial_index_t = gpuspatial::SpatialIndex; + + struct Payload { + std::unique_ptr index; + runtime_data_t* rdata; + }; + + struct ResultBuffer { + std::vector build_indices; + std::vector probe_indices; + ResultBuffer() = default; + + ResultBuffer(const ResultBuffer&) = delete; + ResultBuffer& operator=(const ResultBuffer&) = delete; + + ResultBuffer(ResultBuffer&&) = default; + ResultBuffer& operator=(ResultBuffer&&) = default; + }; + + using private_data_t = GpuSpatialWrapper; + using context_t = GpuSpatialWrapper; + + static void Export(const struct GpuSpatialIndexConfig* config, + struct SedonaFloatIndex2D* out) { + auto* rdata = static_cast(config->runtime->private_data); + + gpuspatial::RTSpatialIndexConfig index_config; + + index_config.rt_engine = rdata->payload.rt_engine; + index_config.concurrency = config->concurrency; + + // Create SpatialIndex may involve GPU operations, set device here + CUDA_CHECK(cudaSetDevice(rdata->payload.device_id)); + + auto uniq_index = gpuspatial::CreateRTSpatialIndex(index_config); + out->clear = &CClear; - out->push_build = &CPushBuild; - out->finish_building = &CFinishBuilding; out->create_context = &CCreateContext; out->destroy_context = &CDestroyContext; - out->push_stream = &CPushStream; + out->push_build = &CPushBuild; + out->finish_building = 
&CFinishBuilding; + out->probe = &CProbe; out->get_build_indices_buffer = &CGetBuildIndicesBuffer; - out->get_stream_indices_buffer = &CGetStreamIndicesBuffer; + out->get_probe_indices_buffer = &CGetProbeIndicesBuffer; + out->get_last_error = &CGetLastError; + out->context_get_last_error = &CContextGetLastError; out->release = &CRelease; - out->last_error = new char[GPUSPATIAL_ERROR_MSG_BUFFER_SIZE]; - } - - static int CInit(struct GpuSpatialJoiner* self, struct GpuSpatialJoinerConfig* config) { - int err = 0; - auto* joiner = static_cast(self->private_data); - try { - gpuspatial::InitSpatialJoiner(joiner, config->ptx_root, config->concurrency); - } catch (const std::exception& e) { - int len = - std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1)); - auto* last_error = const_cast(self->last_error); - strncpy(last_error, e.what(), len); - last_error[len] = '\0'; - err = EINVAL; - } - return err; - } - - static void CCreateContext(struct GpuSpatialJoiner* self, - struct GpuSpatialJoinerContext* context) { - auto* joiner = static_cast(self->private_data); - context->private_data = new std::shared_ptr(joiner->CreateContext()); - context->last_error = new char[GPUSPATIAL_ERROR_MSG_BUFFER_SIZE]; - context->build_indices = new std::vector(); - context->stream_indices = new std::vector(); - } - - static void CDestroyContext(struct GpuSpatialJoinerContext* context) { - delete (std::shared_ptr*)context->private_data; - delete[] context->last_error; - delete (std::vector*)context->build_indices; - delete (std::vector*)context->stream_indices; + out->private_data = new private_data_t{Payload{std::move(uniq_index), rdata}, ""}; + } + + static void CCreateContext(struct SedonaSpatialIndexContext* context) { + context->private_data = new context_t(); + } + + static void CDestroyContext(struct SedonaSpatialIndexContext* context) { + delete static_cast(context->private_data); context->private_data = nullptr; - context->last_error = nullptr; - 
context->build_indices = nullptr; - context->stream_indices = nullptr; - } - - static void CClear(struct GpuSpatialJoiner* self) { - auto* joiner = static_cast(self->private_data); - joiner->Clear(); - } - - static int CPushBuild(struct GpuSpatialJoiner* self, const struct ArrowSchema* schema, - const struct ArrowArray* array, int64_t offset, int64_t length) { - auto* joiner = static_cast(self->private_data); - int err = 0; - try { - joiner->PushBuild(schema, array, offset, length); - } catch (const std::exception& e) { - int len = - std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1)); - auto* last_error = const_cast(self->last_error); - strncpy(last_error, e.what(), len); - last_error[len] = '\0'; - err = EINVAL; - } - return err; - } - - static int CFinishBuilding(struct GpuSpatialJoiner* self) { - auto* joiner = static_cast(self->private_data); - int err = 0; - try { - joiner->FinishBuilding(); - } catch (const std::exception& e) { - int len = - std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1)); - auto* last_error = const_cast(self->last_error); - strncpy(last_error, e.what(), len); - last_error[len] = '\0'; - err = EINVAL; - } - return err; - } - - static int CPushStream(struct GpuSpatialJoiner* self, - struct GpuSpatialJoinerContext* context, - const struct ArrowSchema* schema, const struct ArrowArray* array, - int64_t offset, int64_t length, - enum GpuSpatialPredicate predicate, int32_t array_index_offset) { - auto* joiner = static_cast(self->private_data); - auto* private_data = - (std::shared_ptr*)context->private_data; - int err = 0; - try { - joiner->PushStream(private_data->get(), schema, array, offset, length, - static_cast(predicate), - static_cast*>(context->build_indices), - static_cast*>(context->stream_indices), - array_index_offset); - } catch (const std::exception& e) { - int len = - std::min(strlen(e.what()), (size_t)(GPUSPATIAL_ERROR_MSG_BUFFER_SIZE - 1)); - strncpy((char*)context->last_error, 
e.what(), len); - ((char*)context->last_error)[len] = '\0'; - err = EINVAL; - } - return err; - } - - static void CGetBuildIndicesBuffer(struct GpuSpatialJoinerContext* context, - void** build_indices, + } + + static int CClear(self_t* self) { + return SafeExecute(static_cast(self->private_data), + [=] { use_index(self).Clear(); }); + } + + static int CPushBuild(self_t* self, const float* buf, uint32_t n_rects) { + return SafeExecute(static_cast(self->private_data), [&] { + auto* rects = reinterpret_cast(buf); + use_index(self).PushBuild(rects, n_rects); + }); + } + + static int CFinishBuilding(self_t* self) { + return SafeExecute(static_cast(self->private_data), + [&] { use_index(self).FinishBuilding(); }); + } + + static int CProbe(self_t* self, SedonaSpatialIndexContext* context, const float* buf, + uint32_t n_rects) { + return SafeExecute(static_cast(context->private_data), [&] { + auto* rects = reinterpret_cast(buf); + auto& buff = static_cast(context->private_data)->payload; + use_index(self).Probe(rects, n_rects, &buff.build_indices, &buff.probe_indices); + }); + } + + static void CGetBuildIndicesBuffer(struct SedonaSpatialIndexContext* context, + uint32_t** build_indices, uint32_t* build_indices_length) { - auto* vec = static_cast*>(context->build_indices); + auto* ctx = static_cast(context->private_data); + *build_indices = ctx->payload.build_indices.data(); + *build_indices_length = ctx->payload.build_indices.size(); + } + + static void CGetProbeIndicesBuffer(struct SedonaSpatialIndexContext* context, + uint32_t** probe_indices, + uint32_t* probe_indices_length) { + auto* ctx = static_cast(context->private_data); + *probe_indices = ctx->payload.probe_indices.data(); + *probe_indices_length = ctx->payload.probe_indices.size(); + } + + static const char* CGetLastError(self_t* self) { + auto* private_data = static_cast(self->private_data); + return private_data->last_error.c_str(); + } + + static const char* CContextGetLastError(SedonaSpatialIndexContext* 
self) { + auto* private_data = static_cast(self->private_data); + return private_data->last_error.c_str(); + } + + static void CRelease(self_t* self) { + delete static_cast(self->private_data); + self->private_data = nullptr; + } + + static spatial_index_t& use_index(self_t* self) { + auto* private_data = static_cast(self->private_data); + auto* r_data = private_data->payload.rdata; + + CUDA_CHECK(cudaSetDevice(r_data->payload.device_id)); + return *(private_data->payload.index); + } +}; + +int GpuSpatialIndexFloat2DCreate(struct SedonaFloatIndex2D* index, + const struct GpuSpatialIndexConfig* config) { + try { + GpuSpatialIndexFloat2DExporter::Export(config, index); + } catch (std::exception& e) { + GPUSPATIAL_LOG_ERROR("Failed to create GpuSpatialIndexFloat2D: %s", e.what()); + return EINVAL; + } + return 0; +} + +struct GpuSpatialRefinerExporter { + struct Payload { + std::unique_ptr refiner; + runtime_data_t* rdata; + }; + using private_data_t = GpuSpatialWrapper; + + static void Export(const GpuSpatialRefinerConfig* config, + struct SedonaSpatialRefiner* out) { + auto* rdata = static_cast(config->runtime->private_data); + + gpuspatial::RTSpatialRefinerConfig refiner_config; + + refiner_config.rt_engine = rdata->payload.rt_engine; + refiner_config.concurrency = config->concurrency; + refiner_config.compact = config->compress_bvh; + refiner_config.pipeline_batches = config->pipeline_batches; + + // Create Refinner may involve GPU operations, set device here + CUDA_CHECK(cudaSetDevice(rdata->payload.device_id)); - *build_indices = vec->data(); - *build_indices_length = vec->size(); + auto refiner = gpuspatial::CreateRTSpatialRefiner(refiner_config); + + out->clear = &CClear; + out->push_build = &CPushBuild; + out->finish_building = &CFinishBuilding; + out->refine_loaded = &CRefineLoaded; + out->refine = &CRefine; + out->get_last_error = &CGetLastError; + out->release = &CRelease; + out->private_data = new private_data_t{Payload{std::move(refiner), rdata}, ""}; } 
- static void CGetStreamIndicesBuffer(struct GpuSpatialJoinerContext* context, - void** stream_indices, - uint32_t* stream_indices_length) { - auto* vec = static_cast*>(context->stream_indices); + static int CClear(SedonaSpatialRefiner* self) { + return SafeExecute(static_cast(self->private_data), + [&] { use_refiner(self).Clear(); }); + } - *stream_indices = vec->data(); - *stream_indices_length = vec->size(); + static int CPushBuild(SedonaSpatialRefiner* self, const ArrowSchema* build_schema, + const ArrowArray* build_array) { + return SafeExecute(static_cast(self->private_data), + [&] { use_refiner(self).PushBuild(build_schema, build_array); }); } - static void CRelease(struct GpuSpatialJoiner* self) { - delete[] self->last_error; - auto* joiner = static_cast(self->private_data); - delete joiner; + static int CFinishBuilding(SedonaSpatialRefiner* self) { + return SafeExecute(static_cast(self->private_data), + [&] { use_refiner(self).FinishBuilding(); }); + } + + static int CRefineLoaded(SedonaSpatialRefiner* self, const ArrowSchema* probe_schema, + const ArrowArray* probe_array, + SedonaSpatialRelationPredicate predicate, + uint32_t* build_indices, uint32_t* probe_indices, + uint32_t indices_size, uint32_t* new_indices_size) { + return SafeExecute(static_cast(self->private_data), [&] { + *new_indices_size = use_refiner(self).Refine( + probe_schema, probe_array, static_cast(predicate), + build_indices, probe_indices, indices_size); + }); + } + + static int CRefine(SedonaSpatialRefiner* self, const ArrowSchema* schema1, + const ArrowArray* array1, const ArrowSchema* schema2, + const ArrowArray* array2, SedonaSpatialRelationPredicate predicate, + uint32_t* indices1, uint32_t* indices2, uint32_t indices_size, + uint32_t* new_indices_size) { + return SafeExecute(static_cast(self->private_data), [&] { + *new_indices_size = use_refiner(self).Refine( + schema1, array1, schema2, array2, static_cast(predicate), + indices1, indices2, indices_size); + }); + } + + static 
const char* CGetLastError(SedonaSpatialRefiner* self) { + auto* private_data = static_cast(self->private_data); + return private_data->last_error.c_str(); + } + + static void CRelease(SedonaSpatialRefiner* self) { + delete static_cast(self->private_data); self->private_data = nullptr; - self->last_error = nullptr; + } + + static gpuspatial::SpatialRefiner& use_refiner(SedonaSpatialRefiner* self) { + auto* private_data = static_cast(self->private_data); + auto* r_data = private_data->payload.rdata; + + CUDA_CHECK(cudaSetDevice(r_data->payload.device_id)); + return *(private_data->payload.refiner); } }; -void GpuSpatialJoinerCreate(struct GpuSpatialJoiner* joiner) { - auto idx = gpuspatial::CreateSpatialJoiner(); - GpuSpatialJoinerExporter::Export(idx, joiner); +int GpuSpatialRefinerCreate(SedonaSpatialRefiner* refiner, + const GpuSpatialRefinerConfig* config) { + try { + GpuSpatialRefinerExporter::Export(config, refiner); + } catch (std::exception& e) { + GPUSPATIAL_LOG_ERROR("Failed to create GpuSpatialRefiner: %s", e.what()); + return EINVAL; + } + return 0; } diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu b/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu index da978012c..aaa9d4344 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/relate_engine.cu @@ -14,25 +14,26 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
-#include "gpuspatial/index/detail/launch_parameters.h" -#include "gpuspatial/index/geometry_grouper.hpp" -#include "gpuspatial/index/relate_engine.cuh" #include "gpuspatial/relate/predicate.cuh" #include "gpuspatial/relate/relate.cuh" +#include "gpuspatial/relate/relate_engine.cuh" +#include "gpuspatial/rt/launch_parameters.h" #include "gpuspatial/utils/array_view.h" #include "gpuspatial/utils/helpers.h" #include "gpuspatial/utils/launcher.h" #include "gpuspatial/utils/logger.hpp" -#include "gpuspatial/utils/queue.h" #include "rt/shaders/shader_id.hpp" #include "rmm/cuda_stream_view.hpp" +#include "rmm/device_scalar.hpp" #include "rmm/exec_policy.hpp" #include #include #include +#include "gpuspatial/utils/stopwatch.h" + namespace gpuspatial { namespace detail { DEV_HOST_INLINE bool EvaluatePredicate(Predicate p, int32_t im) { @@ -93,6 +94,92 @@ DEV_HOST_INLINE bool EvaluatePredicate(Predicate p, int32_t im) { } return false; } + +template +uint32_t ComputeNumAabbs(const rmm::cuda_stream_view& stream, + const PolygonArrayView& polygons, + ArrayView polygon_ids, int segs_per_aabb) { + auto n_polygons = polygon_ids.size(); + + rmm::device_uvector n_aabbs(n_polygons, stream); + auto* p_n_aabbs = n_aabbs.data(); + + LaunchKernel(stream, [=] __device__() { + using WarpReduce = cub::WarpReduce; + __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; + auto lane = threadIdx.x % 32; + auto warp_id = threadIdx.x / 32; + auto global_warp_id = TID_1D / 32; + auto n_warps = TOTAL_THREADS_1D / 32; + + for (auto i = global_warp_id; i < n_polygons; i += n_warps) { + auto id = polygon_ids[i]; + const auto& polygon = polygons[id]; + uint32_t total_segs = 0; + + for (auto ring = lane; ring < polygon.num_rings(); ring += 32) { + total_segs += + (polygon.get_ring(ring).num_segments() + segs_per_aabb - 1) / segs_per_aabb; + } + total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs); + if (lane == 0) { + p_n_aabbs[i] = total_segs; + } + } + }); + return 
thrust::reduce(rmm::exec_policy_nosync(stream), n_aabbs.begin(), n_aabbs.end()); +} + +template +uint32_t ComputeNumAabbs(const rmm::cuda_stream_view& stream, + const MultiPolygonArrayView& multi_polygons, + ArrayView multi_polygon_ids, int segs_per_aabb) { + auto n_multi_polygons = multi_polygon_ids.size(); + rmm::device_uvector n_aabbs(n_multi_polygons, stream); + auto* p_n_aabbs = n_aabbs.data(); + + LaunchKernel(stream, [=] __device__() { + using WarpReduce = cub::WarpReduce; + __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; + auto lane = threadIdx.x % 32; + auto warp_id = threadIdx.x / 32; + auto global_warp_id = TID_1D / 32; + auto n_warps = TOTAL_THREADS_1D / 32; + + for (auto i = global_warp_id; i < n_multi_polygons; i += n_warps) { + auto id = multi_polygon_ids[i]; + const auto& multi_polygon = multi_polygons[id]; + + uint32_t multipoly_aabb_count = 0; + + for (int part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) { + auto polygon = multi_polygon.get_polygon(part_idx); + + // Local accumulator for this thread + uint32_t thread_aabb_count = 0; + + for (auto ring = lane; ring < polygon.num_rings(); ring += 32) { + auto n_segs = polygon.get_ring(ring).num_segments(); + + thread_aabb_count += (n_segs + segs_per_aabb - 1) / segs_per_aabb; + } + + // Reduce across the warp to get total AABBs for this polygon (part) + uint32_t part_total = WarpReduce(temp_storage[warp_id]).Sum(thread_aabb_count); + + // Add this part's total to the multi-polygon accumulator + if (lane == 0) { + multipoly_aabb_count += part_total; + } + } + + if (lane == 0) { + p_n_aabbs[i] = multipoly_aabb_count; + } + } + }); + return thrust::reduce(rmm::exec_policy_nosync(stream), n_aabbs.begin(), n_aabbs.end()); +} } // namespace detail template @@ -102,48 +189,49 @@ RelateEngine::RelateEngine( template RelateEngine::RelateEngine( - const DeviceGeometries* geoms1, const details::RTEngine* rt_engine) + const DeviceGeometries* geoms1, const RTEngine* 
rt_engine) : geoms1_(geoms1), rt_engine_(rt_engine) {} template void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const DeviceGeometries& geoms2, - Predicate predicate, Queue>& ids) { + Predicate predicate, rmm::device_uvector& ids1, + rmm::device_uvector& ids2) { switch (geoms2.get_geometry_type()) { case GeometryType::kPoint: { using geom2_array_view_t = PointArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } case GeometryType::kMultiPoint: { using geom2_array_view_t = MultiPointArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } case GeometryType::kLineString: { using geom2_array_view_t = LineStringArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } case GeometryType::kMultiLineString: { using geom2_array_view_t = MultiLineStringArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } case GeometryType::kPolygon: { using geom2_array_view_t = PolygonArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } case GeometryType::kMultiPolygon: { using geom2_array_view_t = MultiPolygonArrayView; Evaluate(stream, geoms2.template GetGeometryArrayView(), - predicate, ids); + predicate, ids1, ids2); break; } default: @@ -153,44 +241,46 @@ void RelateEngine::Evaluate( template template -void RelateEngine::Evaluate( - const rmm::cuda_stream_view& stream, const GEOM2_ARRAY_VIEW_T& geom_array2, - Predicate predicate, Queue>& ids) { +void RelateEngine::Evaluate(const rmm::cuda_stream_view& stream, + const GEOM2_ARRAY_VIEW_T& geom_array2, + Predicate predicate, + rmm::device_uvector& ids1, + rmm::device_uvector& ids2) { switch (geoms1_->get_geometry_type()) { case GeometryType::kPoint: { using geom1_array_view_t = 
PointArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } case GeometryType::kMultiPoint: { using geom1_array_view_t = MultiPointArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } case GeometryType::kLineString: { using geom1_array_view_t = LineStringArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } case GeometryType::kMultiLineString: { using geom1_array_view_t = MultiLineStringArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } case GeometryType::kPolygon: { using geom1_array_view_t = PolygonArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } case GeometryType::kMultiPolygon: { using geom1_array_view_t = MultiPolygonArrayView; Evaluate(stream, geoms1_->template GetGeometryArrayView(), - geom_array2, predicate, ids); + geom_array2, predicate, ids1, ids2); break; } default: @@ -200,11 +290,14 @@ void RelateEngine::Evaluate( template template -void RelateEngine::Evaluate( - const rmm::cuda_stream_view& stream, const GEOM1_ARRAY_VIEW_T& geom_array1, - const GEOM2_ARRAY_VIEW_T& geom_array2, Predicate predicate, - Queue>& ids) { - size_t ids_size = ids.size(stream); +void RelateEngine::Evaluate(const rmm::cuda_stream_view& stream, + const GEOM1_ARRAY_VIEW_T& geom_array1, + const GEOM2_ARRAY_VIEW_T& geom_array2, + Predicate predicate, + rmm::device_uvector& ids1, + rmm::device_uvector& ids2) { + assert(ids1.size() == ids2.size()); + size_t ids_size = ids1.size(); GPUSPATIAL_LOG_INFO( "Refine with generic kernel, geom1 %zu, geom2 %zu, predicate %s, result size %zu", geom_array1.size(), 
geom_array2.size(), PredicateToString(predicate), ids_size); @@ -219,20 +312,24 @@ void RelateEngine::Evaluate( GPUSPATIAL_LOG_WARN( "Evaluate Polygon-Polygon relate with the GPU, which is not well-tested and the performance may be poor."); } - auto end = thrust::remove_if( - rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - [=] __device__(const thrust::pair& pair) { - auto geom1_id = pair.first; - auto geom2_id = pair.second; - const auto& geom1 = geom_array1[geom1_id]; - const auto& geom2 = geom_array2[geom2_id]; - - auto IM = relate(geom1, geom2); - return !detail::EvaluatePredicate(predicate, IM); - }); - size_t new_size = thrust::distance(ids.data(), end); - GPUSPATIAL_LOG_INFO("Refined, result size %zu", new_size); - ids.set_size(stream, new_size); + auto zip_begin = + thrust::make_zip_iterator(thrust::make_tuple(ids1.begin(), ids2.begin())); + auto zip_end = thrust::make_zip_iterator(thrust::make_tuple(ids1.end(), ids2.end())); + + auto end = + thrust::remove_if(rmm::exec_policy_nosync(stream), zip_begin, zip_end, + [=] __device__(const thrust::tuple& tuple) { + auto geom1_id = thrust::get<0>(tuple); + auto geom2_id = thrust::get<1>(tuple); + const auto& geom1 = geom_array1[geom1_id]; + const auto& geom2 = geom_array2[geom2_id]; + + auto IM = relate(geom1, geom2); + return !detail::EvaluatePredicate(predicate, IM); + }); + size_t new_size = thrust::distance(zip_begin, end); + ids1.resize(new_size, stream); + ids2.resize(new_size, stream); } template @@ -240,9 +337,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const PointArrayView& geom_array1, const PolygonArrayView& geom_array2, Predicate predicate, - Queue>& ids) { + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, geom_array1, MultiPointArrayView(), geom_array2, - predicate, ids, false /*inverse IM*/); + predicate, ids1, ids2, false /*inverse IM*/); } template @@ -250,9 +347,9 @@ void RelateEngine::Evaluate( const 
rmm::cuda_stream_view& stream, const MultiPointArrayView& geom_array1, const PolygonArrayView& geom_array2, Predicate predicate, - Queue>& ids) { + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, PointArrayView(), geom_array1, geom_array2, - predicate, ids, false /*inverse IM*/); + predicate, ids1, ids2, false /*inverse IM*/); } template @@ -260,19 +357,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const PolygonArrayView& geom_array1, const PointArrayView& geom_array2, Predicate predicate, - Queue>& ids) { - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, geom_array2, MultiPointArrayView(), geom_array1, - predicate, ids, true /*inverse IM*/); - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + predicate, ids2, ids1, true /*inverse IM*/); } template @@ -280,19 +367,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const PolygonArrayView& geom_array1, const MultiPointArrayView& geom_array2, Predicate predicate, - Queue>& ids) { - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, PointArrayView(), geom_array2, geom_array1, - predicate, ids, true /*inverse IM*/); - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + predicate, ids2, ids1, true /*inverse IM*/); } template @@ -300,9 +377,9 @@ void RelateEngine::Evaluate( 
const rmm::cuda_stream_view& stream, const PointArrayView& geom_array1, const MultiPolygonArrayView& geom_array2, Predicate predicate, - Queue>& ids) { + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, geom_array1, MultiPointArrayView(), geom_array2, - predicate, ids, false /*inverse IM*/); + predicate, ids1, ids2, false /*inverse IM*/); } template @@ -310,9 +387,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const MultiPointArrayView& geom_array1, const MultiPolygonArrayView& geom_array2, Predicate predicate, - Queue>& ids) { + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, PointArrayView(), geom_array1, geom_array2, - predicate, ids, false /*inverse IM*/); + predicate, ids1, ids2, false /*inverse IM*/); } template @@ -320,19 +397,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& geom_array1, const PointArrayView& geom_array2, Predicate predicate, - Queue>& ids) { - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, geom_array2, MultiPointArrayView(), geom_array1, - predicate, ids, true /*inverse IM*/); - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + predicate, ids2, ids1, true /*inverse IM*/); } template @@ -340,19 +407,9 @@ void RelateEngine::Evaluate( const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& geom_array1, const MultiPointArrayView& geom_array2, Predicate predicate, - Queue>& ids) { - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + 
rmm::device_uvector& ids1, rmm::device_uvector& ids2) { EvaluateImpl(stream, PointArrayView(), geom_array2, geom_array1, - predicate, ids, true /*inverse IM*/); - thrust::for_each(rmm::exec_policy_nosync(stream), ids.data(), - ids.data() + ids.size(stream), - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); + predicate, ids2, ids1, true /*inverse IM*/); } template @@ -361,10 +418,12 @@ void RelateEngine::EvaluateImpl( const PointArrayView& point_array, const MultiPointArrayView& multi_point_array, const PolygonArrayView& poly_array, Predicate predicate, - Queue>& ids, bool inverse) { + rmm::device_uvector& point_ids, rmm::device_uvector& poly_ids, + bool inverse) { using params_t = detail::LaunchParamsPolygonPointQuery; - - size_t ids_size = ids.size(stream); + assert(point_array.empty() || multi_point_array.empty()); + assert(point_ids.size() == poly_ids.size()); + size_t ids_size = point_ids.size(); GPUSPATIAL_LOG_INFO( "Refine with ray-tracing, (multi-)point %zu, polygon %zu, predicate %s, result size %zu, inverse %d", !point_array.empty() ? 
point_array.size() : multi_point_array.size(), @@ -373,79 +432,87 @@ void RelateEngine::EvaluateImpl( if (ids_size == 0) { return; } - // pair.first is point id; pair.second is polygon id - // Sort by multi polygon id - thrust::sort(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - [] __device__(const thrust::pair& pair1, - const thrust::pair& pair2) { - return pair1.second < pair2.second; + + auto zip_begin = + thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin(), poly_ids.begin())); + auto zip_end = + thrust::make_zip_iterator(thrust::make_tuple(point_ids.end(), poly_ids.end())); + auto invalid_tuple = thrust::make_tuple(std::numeric_limits::max(), + std::numeric_limits::max()); + + // Sort by polygon id + thrust::sort(rmm::exec_policy_nosync(stream), zip_begin, zip_end, + [] __device__(const thrust::tuple& tu1, + const thrust::tuple& tu2) { + return thrust::get<1>(tu1) < thrust::get<1>(tu2); }); - rmm::device_uvector poly_ids(ids_size, stream); + rmm::device_uvector uniq_poly_ids(ids_size, stream); - thrust::transform(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - poly_ids.data(), - [] __device__(const thrust::pair& pair) { - return pair.second; - }); - auto poly_ids_end = - thrust::unique(rmm::exec_policy_nosync(stream), poly_ids.begin(), poly_ids.end()); - poly_ids.resize(thrust::distance(poly_ids.begin(), poly_ids_end), stream); - poly_ids.shrink_to_fit(stream); + thrust::copy(rmm::exec_policy_nosync(stream), poly_ids.begin(), poly_ids.end(), + uniq_poly_ids.begin()); - auto bvh_bytes = EstimateBVHSize(stream, poly_array, ArrayView(poly_ids)); + // Collect uniq polygon ids to estimate total BVH memory usage + auto uniq_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream), + uniq_poly_ids.begin(), uniq_poly_ids.end()); + uniq_poly_ids.resize(thrust::distance(uniq_poly_ids.begin(), uniq_poly_ids_end), + stream); + uniq_poly_ids.shrink_to_fit(stream); + + auto bvh_bytes = EstimateBVHSize(stream, 
poly_array, ArrayView(uniq_poly_ids), + config_.segs_per_aabb); size_t avail_bytes = rmm::available_device_memory().first * config_.memory_quota; auto n_batches = bvh_bytes / avail_bytes + 1; auto batch_size = (ids_size + n_batches - 1) / n_batches; - auto invalid_pair = thrust::make_pair(std::numeric_limits::max(), - std::numeric_limits::max()); GPUSPATIAL_LOG_INFO( "Unique polygons %zu, memory quota %zu MB, estimated BVH size %zu MB", - poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024)); + uniq_poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024)); for (int batch = 0; batch < n_batches; batch++) { auto ids_begin = batch * batch_size; auto ids_end = std::min(ids_begin + batch_size, ids_size); auto ids_size_batch = ids_end - ids_begin; - poly_ids.resize(ids_size_batch, stream); - thrust::transform(rmm::exec_policy_nosync(stream), ids.data() + ids_begin, - ids.data() + ids_end, poly_ids.data(), - [] __device__(const thrust::pair& pair) { - return pair.second; - }); + // Extract unique polygon IDs in this batch + uniq_poly_ids.resize(ids_size_batch, stream); + thrust::copy(rmm::exec_policy_nosync(stream), poly_ids.begin() + ids_begin, + poly_ids.begin() + ids_end, uniq_poly_ids.begin()); - // ids is sorted - poly_ids_end = - thrust::unique(rmm::exec_policy_nosync(stream), poly_ids.begin(), poly_ids.end()); + // poly ids are sorted + uniq_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream), + uniq_poly_ids.begin(), uniq_poly_ids.end()); - poly_ids.resize(thrust::distance(poly_ids.begin(), poly_ids_end), stream); - poly_ids.shrink_to_fit(stream); + uniq_poly_ids.resize(thrust::distance(uniq_poly_ids.begin(), uniq_poly_ids_end), + stream); + uniq_poly_ids.shrink_to_fit(stream); rmm::device_uvector IMs(ids_size_batch, stream); - rmm::device_uvector seg_begins(0, stream); rmm::device_uvector locations(ids_size_batch, stream); rmm::device_buffer bvh_buffer(0, stream); rmm::device_uvector aabb_poly_ids(0, stream), 
aabb_ring_ids(0, stream); + rmm::device_uvector> aabb_vertex_offsets(0, stream); // aabb id -> vertex begin[polygon] + ith point in this polygon - auto handle = BuildBVH(stream, poly_array, ArrayView(poly_ids), seg_begins, - bvh_buffer, aabb_poly_ids, aabb_ring_ids); + auto handle = BuildBVH(stream, poly_array, ArrayView(uniq_poly_ids), + config_.segs_per_aabb, bvh_buffer, aabb_poly_ids, + aabb_ring_ids, aabb_vertex_offsets); params_t params; params.points = point_array; params.multi_points = multi_point_array; params.polygons = poly_array; - params.polygon_ids = ArrayView(poly_ids); - params.ids = ArrayView>(ids.data() + ids_begin, - ids_size_batch); - params.seg_begins = ArrayView(seg_begins); + params.uniq_polygon_ids = ArrayView(uniq_poly_ids); + params.query_point_ids = point_ids.data() + ids_begin; + params.query_polygon_ids = poly_ids.data() + ids_begin; + params.query_size = ids_size_batch; params.IMs = ArrayView(IMs); params.handle = handle; params.aabb_poly_ids = ArrayView(aabb_poly_ids); params.aabb_ring_ids = ArrayView(aabb_ring_ids); + params.aabb_vertex_offsets = + ArrayView>(aabb_vertex_offsets); rmm::device_buffer params_buffer(sizeof(params_t), stream); @@ -457,34 +524,32 @@ void RelateEngine::EvaluateImpl( dim3{static_cast(ids_size_batch), 1, 1}, ArrayView((char*)params_buffer.data(), params_buffer.size())); - auto* p_IMs = IMs.data(); - auto* p_ids = ids.data(); - - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(ids_size_batch), - ids.data() + ids_begin, [=] __device__(uint32_t i) { - const auto& pair = p_ids[ids_begin + i]; - - auto IM = p_IMs[i]; - if (inverse) { - IM = IntersectionMatrix::Transpose(IM); - } - if (detail::EvaluatePredicate(predicate, IM)) { - return pair; - } else { - return invalid_pair; - } - }); + thrust::transform( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_tuple( + point_ids.begin() + ids_begin, poly_ids.begin() + 
ids_begin, IMs.begin())), + thrust::make_zip_iterator(thrust::make_tuple( + point_ids.begin() + ids_end, poly_ids.begin() + ids_end, IMs.end())), + thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin() + ids_begin, + poly_ids.begin() + ids_begin)), + [=] __device__(const thrust::tuple& t) { + auto res = thrust::make_tuple(thrust::get<0>(t), thrust::get<1>(t)); + auto IM = thrust::get<2>(t); + + if (inverse) { + IM = IntersectionMatrix::Transpose(IM); + } + + return detail::EvaluatePredicate(predicate, IM) ? res : invalid_tuple; + }); } - auto end = thrust::remove_if( - rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - [=] __device__(const thrust::pair& pair) { - return pair == invalid_pair; - }); - size_t new_size = thrust::distance(ids.data(), end); - GPUSPATIAL_LOG_INFO("Refined, result size %zu", new_size); - ids.set_size(stream, new_size); + auto end = thrust::remove_if(rmm::exec_policy_nosync(stream), zip_begin, zip_end, + [=] __device__(const thrust::tuple& tu) { + return tu == invalid_tuple; + }); + size_t new_size = thrust::distance(zip_begin, end); + point_ids.resize(new_size, stream); + poly_ids.resize(new_size, stream); } template @@ -493,11 +558,12 @@ void RelateEngine::EvaluateImpl( const PointArrayView& point_array, const MultiPointArrayView& multi_point_array, const MultiPolygonArrayView& multi_poly_array, Predicate predicate, - Queue>& ids, bool inverse) { + rmm::device_uvector& point_ids, rmm::device_uvector& multi_poly_ids, + bool inverse) { using params_t = detail::LaunchParamsPointMultiPolygonQuery; - assert(point_array.empty() || multi_point_array.empty()); - size_t ids_size = ids.size(stream); + assert(point_ids.size() == multi_poly_ids.size()); + size_t ids_size = point_ids.size(); GPUSPATIAL_LOG_INFO( "Refine with ray-tracing, (multi-)point %zu, multi-polygon %zu, predicate %s, result size %zu, inverse %d", !point_array.empty() ? 
point_array.size() : multi_point_array.size(), @@ -506,37 +572,43 @@ void RelateEngine::EvaluateImpl( if (ids_size == 0) { return; } - // pair.first is point id; pair.second is multi polygon id - // Sort by multi polygon id - thrust::sort(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - [] __device__(const thrust::pair& pair1, - const thrust::pair& pair2) { - return pair1.second < pair2.second; + auto zip_begin = thrust::make_zip_iterator( + thrust::make_tuple(point_ids.begin(), multi_poly_ids.begin())); + auto zip_end = thrust::make_zip_iterator( + thrust::make_tuple(point_ids.end(), multi_poly_ids.end())); + auto invalid_tuple = thrust::make_tuple(std::numeric_limits::max(), + std::numeric_limits::max()); + + // Sort by polygon id + thrust::sort(rmm::exec_policy_nosync(stream), zip_begin, zip_end, + [] __device__(const thrust::tuple& tu1, + const thrust::tuple& tu2) { + return thrust::get<1>(tu1) < thrust::get<1>(tu2); }); - rmm::device_uvector multi_poly_ids(ids_size, stream); + rmm::device_uvector uniq_multi_poly_ids(ids_size, stream); - thrust::transform(rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - multi_poly_ids.data(), - [] __device__(const thrust::pair& pair) { - return pair.second; - }); - auto multi_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream), - multi_poly_ids.begin(), multi_poly_ids.end()); - multi_poly_ids.resize(thrust::distance(multi_poly_ids.begin(), multi_poly_ids_end), - stream); - multi_poly_ids.shrink_to_fit(stream); + thrust::copy(rmm::exec_policy_nosync(stream), multi_poly_ids.begin(), + multi_poly_ids.end(), uniq_multi_poly_ids.begin()); + + // Collect uniq polygon ids to estimate total BVH memory usage + auto uniq_multi_poly_ids_end = + thrust::unique(rmm::exec_policy_nosync(stream), uniq_multi_poly_ids.begin(), + uniq_multi_poly_ids.end()); + uniq_multi_poly_ids.resize( + thrust::distance(uniq_multi_poly_ids.begin(), uniq_multi_poly_ids_end), stream); + 
uniq_multi_poly_ids.shrink_to_fit(stream); auto bvh_bytes = - EstimateBVHSize(stream, multi_poly_array, ArrayView(multi_poly_ids)); + EstimateBVHSize(stream, multi_poly_array, ArrayView(uniq_multi_poly_ids), + config_.segs_per_aabb); size_t avail_bytes = rmm::available_device_memory().first * config_.memory_quota; auto n_batches = bvh_bytes / avail_bytes + 1; auto batch_size = (ids_size + n_batches - 1) / n_batches; - auto invalid_pair = thrust::make_pair(std::numeric_limits::max(), - std::numeric_limits::max()); + GPUSPATIAL_LOG_INFO( "Unique multi-polygons %zu, memory quota %zu MB, estimated BVH size %zu MB", - multi_poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024)); + uniq_multi_poly_ids.size(), avail_bytes / (1024 * 1024), bvh_bytes / (1024 * 1024)); for (int batch = 0; batch < n_batches; batch++) { auto ids_begin = batch * batch_size; @@ -544,47 +616,48 @@ void RelateEngine::EvaluateImpl( auto ids_size_batch = ids_end - ids_begin; // Extract multi polygon IDs in this batch - multi_poly_ids.resize(ids_size_batch, stream); + uniq_multi_poly_ids.resize(ids_size_batch, stream); - thrust::transform(rmm::exec_policy_nosync(stream), ids.data() + ids_begin, - ids.data() + ids_end, multi_poly_ids.data(), - [] __device__(const thrust::pair& pair) { - return pair.second; - }); + thrust::copy(rmm::exec_policy_nosync(stream), multi_poly_ids.begin() + ids_begin, + multi_poly_ids.begin() + ids_end, uniq_multi_poly_ids.begin()); // multi polygon ids have been sorted before - multi_poly_ids_end = thrust::unique(rmm::exec_policy_nosync(stream), - multi_poly_ids.begin(), multi_poly_ids.end()); - multi_poly_ids.resize(thrust::distance(multi_poly_ids.begin(), multi_poly_ids_end), - stream); - multi_poly_ids.shrink_to_fit(stream); + uniq_multi_poly_ids_end = + thrust::unique(rmm::exec_policy_nosync(stream), uniq_multi_poly_ids.begin(), + uniq_multi_poly_ids.end()); + uniq_multi_poly_ids.resize( + thrust::distance(uniq_multi_poly_ids.begin(), 
uniq_multi_poly_ids_end), stream); + uniq_multi_poly_ids.shrink_to_fit(stream); rmm::device_uvector IMs(ids_size_batch, stream); - rmm::device_uvector seg_begins(0, stream); - rmm::device_uvector uniq_part_begins(0, stream); rmm::device_buffer bvh_buffer(0, stream); rmm::device_uvector aabb_multi_poly_ids(0, stream), aabb_part_ids(0, stream), aabb_ring_ids(0, stream); + rmm::device_uvector> aabb_vertex_offsets(0, stream); + rmm::device_uvector uniq_part_begins(0, stream); - auto handle = BuildBVH(stream, multi_poly_array, ArrayView(multi_poly_ids), - seg_begins, uniq_part_begins, bvh_buffer, aabb_multi_poly_ids, - aabb_part_ids, aabb_ring_ids); + auto handle = + BuildBVH(stream, multi_poly_array, ArrayView(uniq_multi_poly_ids), + config_.segs_per_aabb, bvh_buffer, aabb_multi_poly_ids, aabb_part_ids, + aabb_ring_ids, aabb_vertex_offsets, uniq_part_begins); params_t params; params.points = point_array; params.multi_points = multi_point_array; params.multi_polygons = multi_poly_array; - params.multi_polygon_ids = ArrayView(multi_poly_ids); - params.ids = ArrayView>(ids.data() + ids_begin, - ids_size_batch); - params.seg_begins = ArrayView(seg_begins); + params.uniq_multi_polygon_ids = ArrayView(uniq_multi_poly_ids); + params.query_point_ids = point_ids.data() + ids_begin; + params.query_multi_polygon_ids = multi_poly_ids.data() + ids_begin; + params.query_size = ids_size_batch; params.uniq_part_begins = ArrayView(uniq_part_begins); params.IMs = ArrayView(IMs); params.handle = handle; params.aabb_multi_poly_ids = ArrayView(aabb_multi_poly_ids); params.aabb_part_ids = ArrayView(aabb_part_ids); params.aabb_ring_ids = ArrayView(aabb_ring_ids); + params.aabb_vertex_offsets = + ArrayView>(aabb_vertex_offsets); rmm::device_buffer params_buffer(sizeof(params_t), stream); @@ -596,166 +669,90 @@ void RelateEngine::EvaluateImpl( dim3{static_cast(ids_size_batch), 1, 1}, ArrayView((char*)params_buffer.data(), params_buffer.size())); - auto* p_IMs = IMs.data(); - auto* p_ids = 
ids.data(); - - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(ids_size_batch), - ids.data() + ids_begin, [=] __device__(uint32_t i) { - const auto& pair = p_ids[ids_begin + i]; - - auto IM = p_IMs[i]; - if (inverse) { - IM = IntersectionMatrix::Transpose(IM); - } - if (detail::EvaluatePredicate(predicate, IM)) { - return pair; - } else { - return invalid_pair; - } - }); + thrust::transform( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin() + ids_begin, + multi_poly_ids.begin() + ids_begin, + IMs.begin())), + thrust::make_zip_iterator(thrust::make_tuple( + point_ids.begin() + ids_end, multi_poly_ids.begin() + ids_end, IMs.end())), + thrust::make_zip_iterator(thrust::make_tuple(point_ids.begin() + ids_begin, + multi_poly_ids.begin() + ids_begin)), + [=] __device__(const thrust::tuple& t) { + auto res = thrust::make_tuple(thrust::get<0>(t), thrust::get<1>(t)); + auto IM = thrust::get<2>(t); + + if (inverse) { + IM = IntersectionMatrix::Transpose(IM); + } + + return detail::EvaluatePredicate(predicate, IM) ? 
res : invalid_tuple; + }); } - auto end = thrust::remove_if( - rmm::exec_policy_nosync(stream), ids.data(), ids.data() + ids_size, - [=] __device__(const thrust::pair& pair) { - return pair == invalid_pair; - }); - size_t new_size = thrust::distance(ids.data(), end); - GPUSPATIAL_LOG_INFO("Refined, result size %zu", new_size); - ids.set_size(stream, new_size); + auto end = thrust::remove_if(rmm::exec_policy_nosync(stream), zip_begin, zip_end, + [=] __device__(const thrust::tuple& tu) { + return tu == invalid_tuple; + }); + size_t new_size = thrust::distance(zip_begin, end); + point_ids.resize(new_size, stream); + multi_poly_ids.resize(new_size, stream); } template size_t RelateEngine::EstimateBVHSize( const rmm::cuda_stream_view& stream, const PolygonArrayView& polys, - ArrayView poly_ids) { - auto n_polygons = poly_ids.size(); - rmm::device_uvector n_segs(n_polygons, stream); - auto* p_nsegs = n_segs.data(); - - LaunchKernel(stream, [=] __device__() { - using WarpReduce = cub::WarpReduce; - __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; - auto lane = threadIdx.x % 32; - auto warp_id = threadIdx.x / 32; - auto global_warp_id = TID_1D / 32; - auto n_warps = TOTAL_THREADS_1D / 32; - - for (auto i = global_warp_id; i < n_polygons; i += n_warps) { - auto id = poly_ids[i]; - const auto& polygon = polys[id]; - uint32_t total_segs = 0; - - for (auto ring = lane; ring < polygon.num_rings(); ring += 32) { - total_segs += polygon.get_ring(ring).num_points(); - } - total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs); - if (lane == 0) { - p_nsegs[i] = total_segs; - } - } - }); - auto total_segs = - thrust::reduce(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end()); - if (total_segs == 0) { + ArrayView poly_ids, int segs_per_aabb) { + auto num_aabbs = detail::ComputeNumAabbs(stream, polys, poly_ids, segs_per_aabb); + if (num_aabbs == 0) { return 0; } + // temporary but still needed to consider this part of memory - auto aabb_size = 
total_segs * sizeof(OptixAabb); + auto aabb_size = num_aabbs * sizeof(OptixAabb); auto bvh_bytes = rt_engine_->EstimateMemoryUsageForAABB( - total_segs, config_.bvh_fast_build, config_.bvh_fast_compact); - // BVH size and aabb_poly_ids, aabb_ring_ids - return aabb_size + bvh_bytes + 2 * sizeof(INDEX_T) * total_segs; + num_aabbs, config_.bvh_fast_build, config_.bvh_compact); + // BVH size and aabb_poly_ids, aabb_ring_ids, aabb_vertex_offsets + return aabb_size + bvh_bytes + 4 * sizeof(INDEX_T) * num_aabbs; } template size_t RelateEngine::EstimateBVHSize( const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& multi_polys, - ArrayView multi_poly_ids) { - auto n_mult_polygons = multi_poly_ids.size(); - rmm::device_uvector n_segs(n_mult_polygons, stream); - auto* p_nsegs = n_segs.data(); - - LaunchKernel(stream, [=] __device__() { - using WarpReduce = cub::WarpReduce; - __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; - auto lane = threadIdx.x % 32; - auto warp_id = threadIdx.x / 32; - auto global_warp_id = TID_1D / 32; - auto n_warps = TOTAL_THREADS_1D / 32; + ArrayView multi_poly_ids, int segs_per_aabb) { + auto num_aabbs = + detail::ComputeNumAabbs(stream, multi_polys, multi_poly_ids, segs_per_aabb); - for (auto i = global_warp_id; i < n_mult_polygons; i += n_warps) { - auto id = multi_poly_ids[i]; - const auto& multi_polygon = multi_polys[id]; - uint32_t total_segs = 0; - - for (int part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) { - auto polygon = multi_polygon.get_polygon(part_idx); - for (auto ring = lane; ring < polygon.num_rings(); ring += 32) { - total_segs += polygon.get_ring(ring).num_points(); - } - } - total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs); - if (lane == 0) { - p_nsegs[i] = total_segs; - } - } - }); - auto total_segs = - thrust::reduce(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end()); - if (total_segs == 0) { - return 0; - } // temporary but still needed to consider 
this part of memory - auto aabb_size = total_segs * sizeof(OptixAabb); + auto aabb_size = num_aabbs * sizeof(OptixAabb); auto bvh_bytes = rt_engine_->EstimateMemoryUsageForAABB( - total_segs, config_.bvh_fast_build, config_.bvh_fast_compact); - // BVH size and aabb_multi_poly_ids, aabb_part_ids, aabb_ring_ids - return aabb_size + bvh_bytes + 3 * sizeof(INDEX_T) * total_segs; + num_aabbs, config_.bvh_fast_build, config_.bvh_compact); + // BVH size and aabb_multi_poly_ids, aabb_part_ids, aabb_ring_ids, aabb_vertex_offsets + return aabb_size + bvh_bytes + 5 * sizeof(INDEX_T) * num_aabbs; } template OptixTraversableHandle RelateEngine::BuildBVH( const rmm::cuda_stream_view& stream, const PolygonArrayView& polygons, ArrayView polygon_ids, - rmm::device_uvector& seg_begins, rmm::device_buffer& buffer, + int segs_per_aabb, rmm::device_buffer& buffer, rmm::device_uvector& aabb_poly_ids, - rmm::device_uvector& aabb_ring_ids) { + rmm::device_uvector& aabb_ring_ids, + rmm::device_uvector>& aabb_vertex_offsets) { auto n_polygons = polygon_ids.size(); - rmm::device_uvector n_segs(n_polygons, stream); - - // TODO: warp reduce - thrust::transform(rmm::exec_policy_nosync(stream), polygon_ids.begin(), - polygon_ids.end(), n_segs.begin(), - [=] __device__(const uint32_t& id) -> uint32_t { - const auto& polygon = polygons[id]; - uint32_t total_segs = 0; - - for (int ring = 0; ring < polygon.num_rings(); ring++) { - total_segs += polygon.get_ring(ring).num_points(); - } - return total_segs; - }); - - seg_begins = std::move(rmm::device_uvector(n_polygons + 1, stream)); - auto* p_seg_begins = seg_begins.data(); - seg_begins.set_element_to_zero_async(0, stream); - - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end(), - seg_begins.begin() + 1); - - uint32_t num_aabbs = seg_begins.back_element(stream); - + auto num_aabbs = detail::ComputeNumAabbs(stream, polygons, polygon_ids, segs_per_aabb); aabb_poly_ids = std::move(rmm::device_uvector(num_aabbs, 
stream)); aabb_ring_ids = std::move(rmm::device_uvector(num_aabbs, stream)); + aabb_vertex_offsets = + std::move(rmm::device_uvector>(num_aabbs, stream)); - auto* p_poly_ids = aabb_poly_ids.data(); - auto* p_ring_ids = aabb_ring_ids.data(); + auto* p_aabb_poly_ids = aabb_poly_ids.data(); + auto* p_aabb_ring_ids = aabb_ring_ids.data(); + auto* p_aabb_vertex_offsets = aabb_vertex_offsets.data(); - rmm::device_uvector aabbs(num_aabbs, stream); - auto* p_aabbs = aabbs.data(); + rmm::device_scalar d_tail(0, stream); + + auto* p_tail = d_tail.data(); LaunchKernel(stream.value(), [=] __device__() { auto lane = threadIdx.x % 32; @@ -763,191 +760,222 @@ OptixTraversableHandle RelateEngine::BuildBVH( auto n_warps = TOTAL_THREADS_1D / 32; // each warp takes a polygon - // i is the renumbered polygon id starting from 0 for (auto i = global_warp_id; i < n_polygons; i += n_warps) { auto poly_id = polygon_ids[i]; const auto& polygon = polygons[poly_id]; - auto tail = p_seg_begins[i]; // entire warp sequentially visit each ring for (uint32_t ring_idx = 0; ring_idx < polygon.num_rings(); ring_idx++) { auto ring = polygon.get_ring(ring_idx); - // this is like a hash function, its okay to overflow - OptixAabb aabb; - aabb.minZ = aabb.maxZ = i; - - // each lane takes a seg - for (auto seg_idx = lane; seg_idx < ring.num_segments(); seg_idx += 32) { - const auto& seg = ring.get_line_segment(seg_idx); - const auto& p1 = seg.get_p1(); - const auto& p2 = seg.get_p2(); - - aabb.minX = std::min(p1.x(), p2.x()); - aabb.maxX = std::max(p1.x(), p2.x()); - aabb.minY = std::min(p1.y(), p2.y()); - aabb.maxY = std::max(p1.y(), p2.y()); - - if (std::is_same_v) { - aabb.minX = next_float_from_double(aabb.minX, -1, 2); - aabb.maxX = next_float_from_double(aabb.maxX, 1, 2); - aabb.minY = next_float_from_double(aabb.minY, -1, 2); - aabb.maxY = next_float_from_double(aabb.maxY, 1, 2); - } - p_aabbs[tail + seg_idx] = aabb; - p_poly_ids[tail + seg_idx] = poly_id; - p_ring_ids[tail + seg_idx] = ring_idx; + 
auto aabbs_per_ring = (ring.num_segments() + segs_per_aabb - 1) / segs_per_aabb; + // e.g., num segs = 3, segs_per_aabb = 2 + // The first aabb covers seg 0,1, with vertex id (0,1,2) + // The second aabb covers seg 2, with vertex id (2,3) + // each lane takes an aabb + for (auto aabb_idx = lane; aabb_idx < aabbs_per_ring; aabb_idx += 32) { + INDEX_T local_vertex_begin = aabb_idx * segs_per_aabb; + INDEX_T local_vertex_end = + std::min((INDEX_T)(local_vertex_begin + segs_per_aabb), + (INDEX_T)ring.num_segments()); + + auto tail = atomicAdd(p_tail, 1); + + assert(tail < num_aabbs); + p_aabb_poly_ids[tail] = poly_id; + p_aabb_ring_ids[tail] = ring_idx; + p_aabb_vertex_offsets[tail] = + thrust::make_pair(local_vertex_begin, local_vertex_end); } - tail += ring.num_segments(); - // fill a dummy AABB, so we have aabb-vertex one-to-one relationship - if (lane == 0) { - p_aabbs[tail] = OptixAabb{0, 0, 0, 0, 0, 0}; - } - tail++; } - assert(p_seg_begins[i + 1] == tail); } }); + rmm::device_uvector aabbs(num_aabbs, stream); + + // Fill AABBs + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_aabbs), aabbs.begin(), + [=] __device__(const uint32_t& aabb_idx) { + OptixAabb aabb; + aabb.minX = std::numeric_limits::max(); + aabb.minY = std::numeric_limits::max(); + aabb.maxX = std::numeric_limits::lowest(); + aabb.maxY = std::numeric_limits::lowest(); + + auto poly_id = p_aabb_poly_ids[aabb_idx]; + auto ring_id = p_aabb_ring_ids[aabb_idx]; + auto vertex_offset_pair = p_aabb_vertex_offsets[aabb_idx]; + const auto& polygon = polygons[poly_id]; + const auto& ring = polygon.get_ring(ring_id); + + for (auto vidx = vertex_offset_pair.first; + vidx <= vertex_offset_pair.second; vidx++) { + const auto& v = ring.get_point(vidx); + float x = v.x(); + float y = v.y(); + + aabb.minX = fminf(aabb.minX, x); + aabb.maxX = fmaxf(aabb.maxX, x); + aabb.minY = fminf(aabb.minY, y); + aabb.maxY = fmaxf(aabb.maxY, y); + } + + if 
(std::is_same_v) { + aabb.minX = next_float_from_double(aabb.minX, -1, 2); + aabb.maxX = next_float_from_double(aabb.maxX, 1, 2); + aabb.minY = next_float_from_double(aabb.minY, -1, 2); + aabb.maxY = next_float_from_double(aabb.maxY, 1, 2); + } + // Using minZ/maxZ to store polygon id for better filtering + // Refer to polygon_point_query.cu + aabb.minZ = aabb.maxZ = poly_id; + return aabb; + }); + assert(rt_engine_ != nullptr); return rt_engine_->BuildAccelCustom(stream.value(), ArrayView(aabbs), buffer, - config_.bvh_fast_build, config_.bvh_fast_compact); + config_.bvh_fast_build, config_.bvh_compact); } template OptixTraversableHandle RelateEngine::BuildBVH( const rmm::cuda_stream_view& stream, const MultiPolygonArrayView& multi_polys, - ArrayView multi_poly_ids, rmm::device_uvector& seg_begins, - rmm::device_uvector& part_begins, rmm::device_buffer& buffer, + ArrayView multi_poly_ids, int segs_per_aabb, rmm::device_buffer& buffer, rmm::device_uvector& aabb_multi_poly_ids, rmm::device_uvector& aabb_part_ids, - rmm::device_uvector& aabb_ring_ids) { + rmm::device_uvector& aabb_ring_ids, + rmm::device_uvector>& aabb_vertex_offsets, + rmm::device_uvector& part_begins) { auto n_mult_polygons = multi_poly_ids.size(); - rmm::device_uvector n_segs(n_mult_polygons, stream); - auto* p_nsegs = n_segs.data(); - - LaunchKernel(stream, [=] __device__() { - using WarpReduce = cub::WarpReduce; - __shared__ WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; - auto lane = threadIdx.x % 32; - auto warp_id = threadIdx.x / 32; - auto global_warp_id = TID_1D / 32; - auto n_warps = TOTAL_THREADS_1D / 32; - - for (auto i = global_warp_id; i < n_mult_polygons; i += n_warps) { - auto id = multi_poly_ids[i]; - const auto& multi_polygon = multi_polys[id]; - uint32_t total_segs = 0; - - for (int part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) { - auto polygon = multi_polygon.get_polygon(part_idx); - for (auto ring = lane; ring < polygon.num_rings(); ring += 32) { 
- total_segs += polygon.get_ring(ring).num_points(); - } - } - total_segs = WarpReduce(temp_storage[warp_id]).Sum(total_segs); - if (lane == 0) { - p_nsegs[i] = total_segs; - } - } - }); - - seg_begins = std::move(rmm::device_uvector(n_mult_polygons + 1, stream)); - auto* p_seg_begins = seg_begins.data(); - seg_begins.set_element_to_zero_async(0, stream); - - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), n_segs.begin(), n_segs.end(), - seg_begins.begin() + 1); - // each line seg is corresponding to an AABB and each ring includes an empty AABB - uint32_t num_aabbs = seg_begins.back_element(stream); + auto num_aabbs = + detail::ComputeNumAabbs(stream, multi_polys, multi_poly_ids, segs_per_aabb); + if (num_aabbs == 0) { + return 0; + } aabb_multi_poly_ids = std::move(rmm::device_uvector(num_aabbs, stream)); aabb_part_ids = std::move(rmm::device_uvector(num_aabbs, stream)); aabb_ring_ids = std::move(rmm::device_uvector(num_aabbs, stream)); + aabb_vertex_offsets = + std::move(rmm::device_uvector>(num_aabbs, stream)); + rmm::device_uvector aabb_seq_ids(num_aabbs, stream); - auto* p_multi_poly_ids = aabb_multi_poly_ids.data(); - auto* p_part_ids = aabb_part_ids.data(); - auto* p_ring_ids = aabb_ring_ids.data(); - - rmm::device_uvector aabbs(num_aabbs, stream); - auto* p_aabbs = aabbs.data(); - - rmm::device_uvector num_parts(n_mult_polygons, stream); + auto* p_aabb_multi_poly_ids = aabb_multi_poly_ids.data(); + auto* p_aabb_part_ids = aabb_part_ids.data(); + auto* p_aabb_ring_ids = aabb_ring_ids.data(); + auto* p_aabb_vertex_offsets = aabb_vertex_offsets.data(); + auto* p_aabb_seq_ids = aabb_seq_ids.data(); - thrust::transform(rmm::exec_policy_nosync(stream), multi_poly_ids.begin(), - multi_poly_ids.end(), num_parts.begin(), [=] __device__(uint32_t id) { - const auto& multi_polygon = multi_polys[id]; - return multi_polygon.num_polygons(); - }); + rmm::device_scalar d_tail(0, stream); - part_begins = std::move(rmm::device_uvector(n_mult_polygons + 1, stream)); - 
auto* p_part_begins = part_begins.data(); - part_begins.set_element_to_zero_async(0, stream); - thrust::inclusive_scan(rmm::exec_policy_nosync(stream), num_parts.begin(), - num_parts.end(), part_begins.begin() + 1); - num_parts.resize(0, stream); - num_parts.shrink_to_fit(stream); + auto* p_tail = d_tail.data(); LaunchKernel(stream.value(), [=] __device__() { auto lane = threadIdx.x % 32; auto global_warp_id = TID_1D / 32; auto n_warps = TOTAL_THREADS_1D / 32; - // each warp takes a multi polygon - // i is the renumbered polygon id starting from 0 + // each warp takes a polygon for (auto i = global_warp_id; i < n_mult_polygons; i += n_warps) { auto multi_poly_id = multi_poly_ids[i]; const auto& multi_polygon = multi_polys[multi_poly_id]; - auto tail = p_seg_begins[i]; - // entire warp sequentially visit each part for (uint32_t part_idx = 0; part_idx < multi_polygon.num_polygons(); part_idx++) { auto polygon = multi_polygon.get_polygon(part_idx); - // entire warp sequentially visit each ring for (uint32_t ring_idx = 0; ring_idx < polygon.num_rings(); ring_idx++) { auto ring = polygon.get_ring(ring_idx); - // this is like a hash function, its okay to overflow - OptixAabb aabb; - aabb.minZ = aabb.maxZ = p_part_begins[i] + part_idx; - - // each lane takes a seg - for (auto seg_idx = lane; seg_idx < ring.num_segments(); seg_idx += 32) { - const auto& seg = ring.get_line_segment(seg_idx); - const auto& p1 = seg.get_p1(); - const auto& p2 = seg.get_p2(); - - aabb.minX = std::min(p1.x(), p2.x()); - aabb.maxX = std::max(p1.x(), p2.x()); - aabb.minY = std::min(p1.y(), p2.y()); - aabb.maxY = std::max(p1.y(), p2.y()); - - if (std::is_same_v) { - aabb.minX = next_float_from_double(aabb.minX, -1, 2); - aabb.maxX = next_float_from_double(aabb.maxX, 1, 2); - aabb.minY = next_float_from_double(aabb.minY, -1, 2); - aabb.maxY = next_float_from_double(aabb.maxY, 1, 2); - } - p_aabbs[tail + seg_idx] = aabb; - p_multi_poly_ids[tail + seg_idx] = multi_poly_id; - p_part_ids[tail + 
seg_idx] = part_idx; - p_ring_ids[tail + seg_idx] = ring_idx; - } - tail += ring.num_segments(); - // fill a dummy AABB, so we have aabb-vertex one-to-one relationship - if (lane == 0) { - p_aabbs[tail] = OptixAabb{0, 0, 0, 0, 0, 0}; + auto aabbs_per_ring = (ring.num_segments() + segs_per_aabb - 1) / segs_per_aabb; + // e.g., num segs = 3, segs_per_aabb = 2 + // The first aabb covers seg 0,1, with vertex id (0,1,2) + // The second aabb covers seg 2, with vertex id (2,3) + // each lane takes an aabb + for (auto aabb_idx = lane; aabb_idx < aabbs_per_ring; aabb_idx += 32) { + INDEX_T local_vertex_begin = aabb_idx * segs_per_aabb; + INDEX_T local_vertex_end = + std::min((INDEX_T)(local_vertex_begin + segs_per_aabb), + (INDEX_T)ring.num_segments()); + + auto tail = atomicAdd(p_tail, 1); + + assert(tail < num_aabbs); + p_aabb_multi_poly_ids[tail] = multi_poly_id; + p_aabb_part_ids[tail] = part_idx; + p_aabb_ring_ids[tail] = ring_idx; + p_aabb_vertex_offsets[tail] = + thrust::make_pair(local_vertex_begin, local_vertex_end); + p_aabb_seq_ids[tail] = i; } - tail++; } } - assert(p_seg_begins[i + 1] == tail); } }); + rmm::device_uvector aabbs(num_aabbs, stream); + part_begins = std::move(rmm::device_uvector(n_mult_polygons + 1, stream)); + auto* p_part_begins = part_begins.data(); + part_begins.set_element_to_zero_async(0, stream); + rmm::device_uvector num_parts(n_mult_polygons, stream); + + thrust::transform(rmm::exec_policy_nosync(stream), multi_poly_ids.begin(), + multi_poly_ids.end(), num_parts.begin(), [=] __device__(uint32_t id) { + const auto& multi_polygon = multi_polys[id]; + return multi_polygon.num_polygons(); + }); + + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), num_parts.begin(), + num_parts.end(), part_begins.begin() + 1); + num_parts.resize(0, stream); + num_parts.shrink_to_fit(stream); + + // Fill AABBs + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_aabbs), 
aabbs.begin(), + [=] __device__(const uint32_t& aabb_idx) { + OptixAabb aabb; + aabb.minX = std::numeric_limits::max(); + aabb.minY = std::numeric_limits::max(); + aabb.maxX = std::numeric_limits::lowest(); + aabb.maxY = std::numeric_limits::lowest(); + + auto multi_poly_id = p_aabb_multi_poly_ids[aabb_idx]; + auto part_id = p_aabb_part_ids[aabb_idx]; + auto ring_id = p_aabb_ring_ids[aabb_idx]; + auto vertex_offset_pair = p_aabb_vertex_offsets[aabb_idx]; + auto seq_id = p_aabb_seq_ids[aabb_idx]; + auto multi_polygon = multi_polys[multi_poly_id]; + const auto& polygon = multi_polygon.get_polygon(part_id); + const auto& ring = polygon.get_ring(ring_id); + + for (auto vidx = vertex_offset_pair.first; + vidx <= vertex_offset_pair.second; vidx++) { + const auto& v = ring.get_point(vidx); + float x = v.x(); + float y = v.y(); + + aabb.minX = fminf(aabb.minX, x); + aabb.maxX = fmaxf(aabb.maxX, x); + aabb.minY = fminf(aabb.minY, y); + aabb.maxY = fmaxf(aabb.maxY, y); + } + + if (std::is_same_v) { + aabb.minX = next_float_from_double(aabb.minX, -1, 2); + aabb.maxX = next_float_from_double(aabb.maxX, 1, 2); + aabb.minY = next_float_from_double(aabb.minY, -1, 2); + aabb.maxY = next_float_from_double(aabb.maxY, 1, 2); + } + + aabb.minZ = aabb.maxZ = p_part_begins[seq_id] + part_id; + return aabb; + }); assert(rt_engine_ != nullptr); + return rt_engine_->BuildAccelCustom(stream.value(), ArrayView(aabbs), buffer, - config_.bvh_fast_build, config_.bvh_fast_compact); + config_.bvh_fast_build, config_.bvh_compact); } // Explicitly instantiate the template for specific types template class RelateEngine, uint32_t>; diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp b/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp index 7596e0cb3..8e1ba1252 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/rt_engine.cpp @@ -14,7 +14,7 @@ // KIND, either express or implied. 
See the License for the // specific language governing permissions and limitations // under the License. -#include "gpuspatial/index/detail/rt_engine.hpp" +#include "gpuspatial/rt/rt_engine.hpp" #include "gpuspatial/utils/cuda_utils.h" #include "gpuspatial/utils/exception.h" #include "gpuspatial/utils/logger.hpp" @@ -57,8 +57,6 @@ void context_log_cb(unsigned int level, const char* tag, const char* message, vo } // namespace namespace gpuspatial { -namespace details { - // --- RTConfig Method Definitions --- void RTConfig::AddModule(const Module& mod) { @@ -103,6 +101,12 @@ RTConfig get_default_rt_config(const std::string& ptx_root) { RTEngine::RTEngine() : initialized_(false) {} RTEngine::~RTEngine() { + cudaError_t probe = cudaPeekAtLastError(); + + if (probe == cudaErrorCudartUnloading) { + GPUSPATIAL_LOG_ERROR("CUDA runtime is unloaded"); + return; + } if (initialized_) { releaseOptixResources(); } @@ -112,6 +116,7 @@ void RTEngine::Init(const RTConfig& config) { if (initialized_) { releaseOptixResources(); } + GPUSPATIAL_LOG_INFO("Initialize RTEngine"); initOptix(config); createContext(); createModule(config); @@ -163,32 +168,34 @@ OptixTraversableHandle RTEngine::BuildAccelCustom(cudaStream_t cuda_stream, OPTIX_CHECK(optixAccelComputeMemoryUsage(optix_context_, &accelOptions, &build_input, 1, &blas_buffer_sizes)); - GPUSPATIAL_LOG_INFO( + GPUSPATIAL_LOG_DEBUG( "ComputeBVHMemoryUsage, AABB count: %u, temp size: %zu MB, output size: %zu MB", num_prims, blas_buffer_sizes.tempSizeInBytes / 1024 / 1024, blas_buffer_sizes.outputSizeInBytes / 1024 / 1024); rmm::device_buffer temp_buf(blas_buffer_sizes.tempSizeInBytes, cuda_stream); - out_buf.resize(blas_buffer_sizes.outputSizeInBytes, cuda_stream); if (compact) { + rmm::device_buffer uncompacted_buf(blas_buffer_sizes.outputSizeInBytes, cuda_stream); rmm::device_scalar compacted_size(cuda_stream); OptixAccelEmitDesc emitDesc; emitDesc.type = OPTIX_PROPERTY_TYPE_COMPACTED_SIZE; emitDesc.result = 
reinterpret_cast(compacted_size.data()); - OPTIX_CHECK(optixAccelBuild( - optix_context_, cuda_stream, &accelOptions, &build_input, 1, - reinterpret_cast(temp_buf.data()), blas_buffer_sizes.tempSizeInBytes, - reinterpret_cast(out_buf.data()), - blas_buffer_sizes.outputSizeInBytes, &traversable, &emitDesc, 1)); + OPTIX_CHECK(optixAccelBuild(optix_context_, cuda_stream, &accelOptions, &build_input, + 1, reinterpret_cast(temp_buf.data()), + blas_buffer_sizes.tempSizeInBytes, + reinterpret_cast(uncompacted_buf.data()), + uncompacted_buf.size(), &traversable, &emitDesc, 1)); auto size = compacted_size.value(cuda_stream); out_buf.resize(size, cuda_stream); OPTIX_CHECK(optixAccelCompact(optix_context_, cuda_stream, traversable, - reinterpret_cast(out_buf.data()), size, - &traversable)); + reinterpret_cast(out_buf.data()), + out_buf.size(), &traversable)); } else { + out_buf.resize(blas_buffer_sizes.outputSizeInBytes, cuda_stream); + OPTIX_CHECK(optixAccelBuild( optix_context_, cuda_stream, &accelOptions, &build_input, 1, reinterpret_cast(temp_buf.data()), blas_buffer_sizes.tempSizeInBytes, @@ -488,15 +495,14 @@ std::vector RTEngine::readData(const std::string& filename) { } void RTEngine::releaseOptixResources() { + GPUSPATIAL_LOG_INFO("Release OptiX resources"); for (auto& [id, res] : resources_) { - optixPipelineDestroy(res.pipeline); - optixProgramGroupDestroy(res.raygen_pg); - optixProgramGroupDestroy(res.miss_pg); - optixProgramGroupDestroy(res.hitgroup_pg); - optixModuleDestroy(res.module); + OPTIX_CHECK(optixPipelineDestroy(res.pipeline)); + OPTIX_CHECK(optixProgramGroupDestroy(res.raygen_pg)); + OPTIX_CHECK(optixProgramGroupDestroy(res.miss_pg)); + OPTIX_CHECK(optixProgramGroupDestroy(res.hitgroup_pg)); + OPTIX_CHECK(optixModuleDestroy(res.module)); } - optixDeviceContextDestroy(optix_context_); + OPTIX_CHECK(optixDeviceContextDestroy(optix_context_)); } - -} // namespace details } // namespace gpuspatial diff --git 
a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu index 3ffdca9ea..f09acd913 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_backward.cu @@ -14,8 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -#include "gpuspatial/index/detail/launch_parameters.h" -#include "gpuspatial/relate/relate.cuh" +#include "gpuspatial/rt/launch_parameters.h" #include "ray_params.h" #include "shader_config.h" @@ -32,17 +31,22 @@ extern "C" __global__ void __intersection__gpuspatial() { using point_t = gpuspatial::ShaderPointType; constexpr int n_dim = point_t::n_dim; using ray_params_t = gpuspatial::detail::RayParams; - auto geom1_id = optixGetPayload_0(); - auto geom2_id = optixGetPrimitiveIndex(); - const auto& mbr1 = params.mbrs1[geom1_id]; - const auto& mbr2 = params.mbrs2[geom2_id]; - const auto& aabb1 = mbr1.ToOptixAabb(); - const auto aabb2 = mbr2.ToOptixAabb(); + auto rect1_id = optixGetPayload_0(); + auto rect2_id = optixGetPrimitiveIndex(); + const auto& rect1 = params.rects1[rect1_id]; + const auto& rect2 = params.rects2[rect2_id]; + const auto& aabb1 = rect1.ToOptixAabb(); + const auto aabb2 = rect2.ToOptixAabb(); ray_params_t ray_params(aabb1, false); if (ray_params.IsHit(aabb2)) { - if (mbr1.intersects(mbr2)) { - params.ids.Append(thrust::make_pair(geom1_id, geom2_id)); + if (rect1.intersects(rect2)) { + if (params.count == nullptr) { + auto tail = params.rect1_ids.Append(rect1_id); + params.rect2_ids[tail] = rect2_id; + } else { + atomicAdd(params.count, 1); + } } } } @@ -53,20 +57,18 @@ extern "C" __global__ void __raygen__gpuspatial() { using point_t = gpuspatial::ShaderPointType; constexpr int n_dim = point_t::n_dim; - for (uint32_t i = optixGetLaunchIndex().x; i < 
params.mbrs1.size(); + for (uint32_t i = optixGetLaunchIndex().x; i < params.rects1.size(); i += optixGetLaunchDimensions().x) { - const auto& mbr1 = params.mbrs1[i]; - auto aabb1 = mbr1.ToOptixAabb(); + const auto& rect1 = params.rects1[i]; + if (!rect1.valid()) continue; + auto aabb1 = rect1.ToOptixAabb(); gpuspatial::detail::RayParams ray_params(aabb1, false); - float3 origin, dir; + float3 origin{0, 0, 0}, dir{0, 0, 0}; - origin.x = ray_params.o.x; - origin.y = ray_params.o.y; - origin.z = 0; - - dir.x = ray_params.d.x; - dir.y = ray_params.d.y; - dir.z = 0; + for (int dim = 0; dim < n_dim; dim++) { + (&origin.x)[dim] = (&ray_params.o.x)[dim]; + (&dir.x)[dim] = (&ray_params.d.x)[dim]; + } float tmin = 0; float tmax = 1; diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu index d85d63741..424f9d3ad 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/box_query_forward.cu @@ -14,7 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
-#include "gpuspatial/index/detail/launch_parameters.h" +#include "gpuspatial/rt/launch_parameters.h" #include "ray_params.h" #include "shader_config.h" @@ -31,20 +31,25 @@ extern "C" __global__ void __intersection__gpuspatial() { using point_t = gpuspatial::ShaderPointType; constexpr int n_dim = point_t::n_dim; using ray_params_t = gpuspatial::detail::RayParams; - auto geom1_id = optixGetPrimitiveIndex(); - uint64_t geom2_id = optixGetPayload_0(); - const auto& mbr1 = params.mbrs1[geom1_id]; - const auto& mbr2 = params.mbrs2[geom2_id]; - const auto& aabb1 = mbr1.ToOptixAabb(); - const auto aabb2 = mbr2.ToOptixAabb(); + auto rect1_id = optixGetPrimitiveIndex(); + uint64_t rect2_id = optixGetPayload_0(); + const auto& rect1 = params.rects1[rect1_id]; + const auto& rect2 = params.rects2[rect2_id]; + const auto& aabb1 = rect1.ToOptixAabb(); + const auto aabb2 = rect2.ToOptixAabb(); ray_params_t ray_params(aabb2, true); if (ray_params.IsHit(aabb1)) { // ray cast from AABB2 hits AABB1 ray_params = ray_params_t(aabb1, false); if (!ray_params.IsHit(aabb2)) { // ray cast from AABB1 does not hit AABB2 - if (mbr1.intersects(mbr2)) { - params.ids.Append(thrust::make_pair(geom1_id, geom2_id)); + if (rect1.intersects(rect2)) { + if (params.count == nullptr) { + auto tail = params.rect1_ids.Append(rect1_id); + params.rect2_ids[tail] = rect2_id; + } else { + atomicAdd(params.count, 1); + } } } } @@ -56,20 +61,20 @@ extern "C" __global__ void __raygen__gpuspatial() { using point_t = gpuspatial::ShaderPointType; constexpr int n_dim = point_t::n_dim; - for (uint32_t i = optixGetLaunchIndex().x; i < params.mbrs2.size(); + for (uint32_t i = optixGetLaunchIndex().x; i < params.rects2.size(); i += optixGetLaunchDimensions().x) { - const auto& mbr2 = params.mbrs2[i]; - auto aabb2 = mbr2.ToOptixAabb(); - gpuspatial::detail::RayParams ray_params(aabb2, true); - float3 origin, dir; + const auto& rect2 = params.rects2[i]; + + if (!rect2.valid()) continue; - origin.x = ray_params.o.x; - 
origin.y = ray_params.o.y; - origin.z = 0; + auto aabb2 = rect2.ToOptixAabb(); + gpuspatial::detail::RayParams ray_params(aabb2, true); + float3 origin{0, 0, 0}, dir{0, 0, 0}; - dir.x = ray_params.d.x; - dir.y = ray_params.d.y; - dir.z = 0; + for (int dim = 0; dim < n_dim; dim++) { + (&origin.x)[dim] = (&ray_params.o.x)[dim]; + (&dir.x)[dim] = (&ray_params.d.x)[dim]; + } float tmin = 0; float tmax = 1; diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake index 56daf449a..13aac4e03 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/config_shaders.cmake @@ -20,7 +20,7 @@ function(CONFIG_SHADERS SHADER_PTX_FILES) set(SHADER_POINT_TYPES "SHADER_POINT_FLOAT_2D;SHADER_POINT_DOUBLE_2D") set(SHADERS_DEPS "${PROJECT_SOURCE_DIR}/include/gpuspatial/geom" - "${PROJECT_SOURCE_DIR}/include/gpuspatial/index/detail") + "${PROJECT_SOURCE_DIR}/include/gpuspatial/rt") set(OUTPUT_DIR "${PROJECT_BINARY_DIR}/shaders_ptx") set(OPTIX_MODULE_EXTENSION ".ptx") diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu index f96226c69..24894fb9e 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/multipolygon_point_query.cu @@ -14,11 +14,11 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
-#include "gpuspatial/geom/line_segment.cuh" #include "gpuspatial/geom/ray_crossing_counter.cuh" -#include "gpuspatial/index/detail/launch_parameters.h" #include "gpuspatial/relate/relate.cuh" +#include "gpuspatial/rt/launch_parameters.h" #include "gpuspatial/utils/floating_point.h" +#include "gpuspatial/utils/helpers.h" #include "shader_config.h" #include @@ -44,35 +44,36 @@ extern "C" __global__ void __intersection__gpuspatial() { auto point_part_id = optixGetPayload_7(); const auto& multi_polygons = params.multi_polygons; - auto point_idx = params.ids[query_idx].first; - auto multi_polygon_idx = params.ids[query_idx].second; + auto point_idx = params.query_point_ids[query_idx]; + auto multi_polygon_idx = params.query_multi_polygon_ids[query_idx]; auto hit_multipolygon_idx = params.aabb_multi_poly_ids[aabb_id]; auto hit_part_idx = params.aabb_part_ids[aabb_id]; auto hit_ring_idx = params.aabb_ring_ids[aabb_id]; - + const auto& vertex_offsets = params.aabb_vertex_offsets[aabb_id]; // the seg being hit is not from the query polygon if (hit_multipolygon_idx != multi_polygon_idx || hit_part_idx != part_idx || hit_ring_idx != ring_idx) { return; } - uint32_t local_v1_idx = aabb_id - params.seg_begins[reordered_multi_polygon_idx]; - uint32_t global_v1_idx = v_offset + local_v1_idx; - uint32_t global_v2_idx = global_v1_idx + 1; - - auto vertices = multi_polygons.get_vertices(); - // segment being hit - const auto& v1 = vertices[global_v1_idx]; - const auto& v2 = vertices[global_v2_idx]; - + const auto& multi_polygon = multi_polygons[multi_polygon_idx]; + const auto& polygon = multi_polygon.get_polygon(part_idx); + const auto& ring = polygon.get_ring(ring_idx); RayCrossingCounter locator(crossing_count, point_on_seg); - if (!params.points.empty()) { - const auto& p = params.points[point_idx]; - locator.countSegment(p, v1, v2); - } else if (!params.multi_points.empty()) { - const auto& p = params.multi_points[point_idx].get_point(point_part_id); - locator.countSegment(p, 
v1, v2); + // For each segment in the AABB, count crossings + for (auto vertex_offset = vertex_offsets.first; vertex_offset < vertex_offsets.second; + ++vertex_offset) { + const auto& v1 = ring.get_point(vertex_offset); + const auto& v2 = ring.get_point(vertex_offset + 1); + + if (!params.points.empty()) { + const auto& p = params.points[point_idx]; + locator.countSegment(p, v1, v2); + } else if (!params.multi_points.empty()) { + const auto& p = params.multi_points[point_idx].get_point(point_part_id); + locator.countSegment(p, v1, v2); + } } optixSetPayload_5(locator.get_crossing_count()); @@ -82,22 +83,23 @@ extern "C" __global__ void __intersection__gpuspatial() { extern "C" __global__ void __raygen__gpuspatial() { using namespace gpuspatial; using point_t = gpuspatial::ShaderPointType; - const auto& ids = params.ids; const auto& multi_polygons = params.multi_polygons; - for (uint32_t i = optixGetLaunchIndex().x; i < ids.size(); + for (uint32_t i = optixGetLaunchIndex().x; i < params.query_size; i += optixGetLaunchDimensions().x) { - auto point_idx = ids[i].first; - auto multi_polygon_idx = ids[i].second; + auto point_idx = params.query_point_ids[i]; + auto multi_polygon_idx = params.query_multi_polygon_ids[i]; - auto it = thrust::lower_bound(thrust::seq, params.multi_polygon_ids.begin(), - params.multi_polygon_ids.end(), multi_polygon_idx); - assert(it != params.multi_polygon_ids.end()); + auto it = thrust::lower_bound(thrust::seq, params.uniq_multi_polygon_ids.begin(), + params.uniq_multi_polygon_ids.end(), multi_polygon_idx); + assert(it != params.uniq_multi_polygon_ids.end()); uint32_t reordered_multi_polygon_idx = - thrust::distance(params.multi_polygon_ids.begin(), it); - assert(params.multi_polygon_ids[reordered_multi_polygon_idx] == multi_polygon_idx); + thrust::distance(params.uniq_multi_polygon_ids.begin(), it); + assert(params.uniq_multi_polygon_ids[reordered_multi_polygon_idx] == + multi_polygon_idx); auto handle_point = [&](const point_t& p, uint32_t 
point_part_id, int& IM) { + assert(!p.empty()); float3 origin; // each polygon takes a z-plane origin.x = p.x(); @@ -108,7 +110,8 @@ extern "C" __global__ void __raygen__gpuspatial() { const auto& mbr = multi_polygon.get_mbr(); auto width = mbr.get_max().x() - mbr.get_min().x(); float tmin = 0; - float tmax = width; + // ensure the floating number is greater than the double + float tmax = next_float_from_double(width, 1, 2); // first polygon offset uint32_t part_offset = multi_polygons.get_prefix_sum_geoms()[multi_polygon_idx]; diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu index 93f5ceb05..c5f90fcc2 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/point_query.cu @@ -14,7 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. 
-#include "gpuspatial/index/detail/launch_parameters.h" +#include "gpuspatial/rt/launch_parameters.h" #include "shader_config.h" #include @@ -29,51 +29,38 @@ extern "C" __constant__ extern "C" __global__ void __intersection__gpuspatial() { auto aabb_id = optixGetPrimitiveIndex(); - auto geom2_id = optixGetPayload_0(); - const auto& point = params.points2[geom2_id]; - const auto& mbrs1 = params.mbrs1; + auto point_id = optixGetPayload_0(); + const auto& point = params.points[point_id]; + const auto& rect = params.rects[aabb_id]; - if (params.grouped) { - assert(!params.prefix_sum.empty()); - auto begin = params.prefix_sum[aabb_id]; - auto end = params.prefix_sum[aabb_id + 1]; - - for (auto offset = begin; offset < end; offset++) { - auto geom1_id = params.reordered_indices[offset]; - if (mbrs1.empty()) { - params.ids.Append(thrust::make_pair(geom1_id, geom2_id)); - } else { - const auto& mbr1 = mbrs1[geom1_id]; - - if (mbr1.covers(point.as_float())) { - params.ids.Append(thrust::make_pair(geom1_id, geom2_id)); - } - } - } - } else { - assert(!mbrs1.empty()); - auto geom1_id = aabb_id; - const auto& mbr1 = mbrs1[geom1_id]; - - if (mbr1.covers(point.as_float())) { - params.ids.Append(thrust::make_pair(geom1_id, geom2_id)); + if (rect.covers(point)) { + if (params.count == nullptr) { + auto tail = params.rect_ids.Append(aabb_id); + params.point_ids[tail] = point_id; + } else { + atomicAdd(params.count, 1); } } } extern "C" __global__ void __raygen__gpuspatial() { + using point_t = gpuspatial::ShaderPointType; + constexpr int n_dim = point_t::n_dim; float tmin = 0; float tmax = FLT_MIN; - for (uint32_t i = optixGetLaunchIndex().x; i < params.points2.size(); + for (uint32_t i = optixGetLaunchIndex().x; i < params.points.size(); i += optixGetLaunchDimensions().x) { - const auto& p = params.points2[i]; + const auto& p = params.points[i]; + if (p.empty()) { + continue; + } - float3 origin; + float3 origin{0, 0, 0}; - origin.x = p.get_coordinate(0); - origin.y = 
p.get_coordinate(1); - origin.z = 0; + for (int dim = 0; dim < n_dim; dim++) { + (&origin.x)[dim] = p.get_coordinate(dim); + } float3 dir = {0, 0, 1}; optixTrace(params.handle, origin, dir, tmin, tmax, 0, OptixVisibilityMask(255), diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu index 97cb948d1..beeb464da 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt/shaders/polygon_point_query.cu @@ -14,10 +14,10 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. -#include "gpuspatial/geom/line_segment.cuh" #include "gpuspatial/geom/ray_crossing_counter.cuh" -#include "gpuspatial/index/detail/launch_parameters.h" #include "gpuspatial/relate/relate.cuh" +#include "gpuspatial/rt/launch_parameters.h" +#include "gpuspatial/utils/helpers.h" #include "shader_config.h" #include @@ -41,32 +41,34 @@ extern "C" __global__ void __intersection__gpuspatial() { auto point_on_seg = optixGetPayload_5(); auto point_part_id = optixGetPayload_6(); const auto& polygons = params.polygons; - auto point_idx = params.ids[query_idx].first; - auto polygon_idx = params.ids[query_idx].second; + auto point_idx = params.query_point_ids[query_idx]; + auto polygon_idx = params.query_polygon_ids[query_idx]; auto hit_polygon_idx = params.aabb_poly_ids[aabb_id]; auto hit_ring_idx = params.aabb_ring_ids[aabb_id]; + const auto& vertex_offsets = params.aabb_vertex_offsets[aabb_id]; // the seg being hit is not from the query polygon if (hit_polygon_idx != polygon_idx || hit_ring_idx != ring_idx) { return; } - uint32_t local_v1_idx = aabb_id - params.seg_begins[reordered_polygon_idx]; - uint32_t global_v1_idx = v_offset + local_v1_idx; - uint32_t global_v2_idx = global_v1_idx + 1; + auto ring = 
polygons[polygon_idx].get_ring(ring_idx); + RayCrossingCounter locator(crossing_count, point_on_seg); - auto vertices = polygons.get_vertices(); - // segment being hit - const auto& v1 = vertices[global_v1_idx]; - const auto& v2 = vertices[global_v2_idx]; + // For each segment in the AABB, count crossings + for (auto vertex_offset = vertex_offsets.first; vertex_offset < vertex_offsets.second; + ++vertex_offset) { + const auto& v1 = ring.get_point(vertex_offset); + const auto& v2 = ring.get_point(vertex_offset + 1); - RayCrossingCounter locator(crossing_count, point_on_seg); - if (!params.points.empty()) { - const auto& p = params.points[point_idx]; - locator.countSegment(p, v1, v2); - } else if (!params.multi_points.empty()) { - const auto& p = params.multi_points[point_idx].get_point(point_part_id); - locator.countSegment(p, v1, v2); + if (!params.points.empty()) { + const auto& p = params.points[point_idx]; + locator.countSegment(p, v1, v2); + } else if (!params.multi_points.empty()) { + const auto& p = params.multi_points[point_idx].get_point(point_part_id); + locator.countSegment(p, v1, v2); + } } + optixSetPayload_4(locator.get_crossing_count()); optixSetPayload_5(locator.get_point_on_segment()); } @@ -74,32 +76,30 @@ extern "C" __global__ void __intersection__gpuspatial() { extern "C" __global__ void __raygen__gpuspatial() { using namespace gpuspatial; using point_t = gpuspatial::ShaderPointType; - const auto& ids = params.ids; const auto& polygons = params.polygons; - for (uint32_t i = optixGetLaunchIndex().x; i < ids.size(); + for (uint32_t i = optixGetLaunchIndex().x; i < params.query_size; i += optixGetLaunchDimensions().x) { - auto point_idx = ids[i].first; - auto polygon_idx = ids[i].second; + auto point_idx = params.query_point_ids[i]; + auto polygon_idx = params.query_polygon_ids[i]; - auto it = thrust::lower_bound(thrust::seq, params.polygon_ids.begin(), - params.polygon_ids.end(), polygon_idx); - assert(it != params.polygon_ids.end()); - uint32_t 
reordered_polygon_idx = thrust::distance(params.polygon_ids.begin(), it); - assert(params.polygon_ids[reordered_polygon_idx] == polygon_idx); + auto it = thrust::lower_bound(thrust::seq, params.uniq_polygon_ids.begin(), + params.uniq_polygon_ids.end(), polygon_idx); + assert(it != params.uniq_polygon_ids.end()); + uint32_t reordered_polygon_idx = + thrust::distance(params.uniq_polygon_ids.begin(), it); + assert(params.uniq_polygon_ids[reordered_polygon_idx] == polygon_idx); auto handle_point = [&](const point_t& p, uint32_t point_part_id, int& IM) { - float3 origin; - // each polygon takes a z-plane - origin.x = p.x(); - origin.y = p.y(); + assert(!p.empty()); // cast ray toward positive x-axis float3 dir = {1, 0, 0}; const auto& polygon = polygons[polygon_idx]; const auto& mbr = polygon.get_mbr(); auto width = mbr.get_max().x() - mbr.get_min().x(); float tmin = 0; - float tmax = width; + // ensure the floating number is greater than the double + float tmax = next_float_from_double(width, 1, 2); // first polygon offset uint32_t ring_offset = polygons.get_prefix_sum_polygons()[polygon_idx]; @@ -119,7 +119,11 @@ extern "C" __global__ void __raygen__gpuspatial() { IM |= IntersectionMatrix::EXTER_INTER_2D | IntersectionMatrix::EXTER_BOUND_1D; uint32_t ring = 0; locator.Init(); - origin.z = reordered_polygon_idx; + float3 origin; + // each polygon takes a z-plane + origin.x = p.x(); + origin.y = p.y(); + origin.z = polygon_idx; // test exterior optixTrace(params.handle, origin, dir, tmin, tmax, 0, OptixVisibilityMask(255), OPTIX_RAY_FLAG_NONE, // OPTIX_RAY_FLAG_NONE, diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu new file mode 100644 index 000000000..67fb6abb1 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_index.cu @@ -0,0 +1,678 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "gpuspatial/index/rt_spatial_index.cuh" +#include "gpuspatial/rt/launch_parameters.h" +#include "gpuspatial/utils/launcher.h" +#include "gpuspatial/utils/logger.hpp" +#include "gpuspatial/utils/morton_code.h" +#include "gpuspatial/utils/stopwatch.h" + +#include "rt/shaders/shader_id.hpp" + +#include "rmm/exec_policy.hpp" + +#include +#include +#include +#include + +#define OPTIX_MAX_RAYS (1lu << 30) + +namespace gpuspatial { +namespace detail { + +template +static rmm::device_uvector ComputeAABBs(rmm::cuda_stream_view stream, + const ArrayView>& mbrs) { + rmm::device_uvector aabbs(mbrs.size(), stream); + + thrust::transform(rmm::exec_policy_nosync(stream), mbrs.begin(), mbrs.end(), + aabbs.begin(), [] __device__(const Box& mbr) { + // handle empty boxes + if (mbr.get_min().empty() || mbr.get_max().empty()) { + // empty box + OptixAabb empty_aabb; + empty_aabb.minX = empty_aabb.minY = empty_aabb.minZ = 0.0f; + empty_aabb.maxX = empty_aabb.maxY = empty_aabb.maxZ = -1.0f; + return empty_aabb; + } + return mbr.ToOptixAabb(); + }); + return std::move(aabbs); +} + +template +rmm::device_uvector ComputeAABBs( + rmm::cuda_stream_view stream, rmm::device_uvector& points, + rmm::device_uvector& prefix_sum, + rmm::device_uvector& reordered_indices, int group_size, + 
rmm::device_uvector>& mbrs) { + using scalar_t = typename POINT_T::scalar_t; + using box_t = Box; + constexpr int n_dim = POINT_T::n_dim; + static_assert(n_dim == 2 || n_dim == 3, "Only 2D and 3D points are supported"); + POINT_T min_world_corner, max_world_corner; + + min_world_corner.set_max(); + max_world_corner.set_min(); + + for (int dim = 0; dim < n_dim; dim++) { + auto min_val = thrust::transform_reduce( + rmm::exec_policy_nosync(stream), points.begin(), points.end(), + [=] __device__(const POINT_T& p) -> scalar_t { return p.get_coordinate(dim); }, + std::numeric_limits::max(), thrust::minimum()); + auto max_val = thrust::transform_reduce( + rmm::exec_policy_nosync(stream), points.begin(), points.end(), + [=] __device__(const POINT_T& p) -> scalar_t { return p.get_coordinate(dim); }, + std::numeric_limits::lowest(), thrust::maximum()); + min_world_corner.set_coordinate(dim, min_val); + max_world_corner.set_coordinate(dim, max_val); + } + + auto np = points.size(); + rmm::device_uvector morton_codes(np, stream); + // compute morton codes and reorder indices + thrust::transform(rmm::exec_policy_nosync(stream), points.begin(), points.end(), + morton_codes.begin(), [=] __device__(const POINT_T& p) { + POINT_T norm_p; + + for (int dim = 0; dim < n_dim; dim++) { + auto min_val = min_world_corner.get_coordinate(dim); + auto max_val = max_world_corner.get_coordinate(dim); + auto extent = min_val == max_val ? 
1 : max_val - min_val; + auto norm_val = (p.get_coordinate(dim) - min_val) / extent; + norm_p.set_coordinate(dim, norm_val); + } + return detail::morton_code(norm_p.get_vec()); + }); + reordered_indices.resize(np, stream); + thrust::sequence(rmm::exec_policy_nosync(stream), reordered_indices.begin(), + reordered_indices.end()); + thrust::sort_by_key(rmm::exec_policy_nosync(stream), morton_codes.begin(), + morton_codes.end(), reordered_indices.begin()); + auto n_aabbs = (np + group_size - 1) / group_size; + mbrs.resize(n_aabbs, stream); + rmm::device_uvector aabbs(n_aabbs, stream); + rmm::device_uvector np_per_aabb(n_aabbs, stream); + + auto* p_reordered_indices = reordered_indices.data(); + auto* p_aabbs = aabbs.data(); + auto* p_np_per_aabb = np_per_aabb.data(); + ArrayView v_points(points); + ArrayView v_mbrs(mbrs); + // each warp takes an AABB and processes points_per_aabb points + LaunchKernel(stream, [=] __device__() mutable { + using WarpReduce = cub::WarpReduce; + // One temp storage slot per active warp + __shared__ typename WarpReduce::TempStorage temp_storage[MAX_BLOCK_SIZE / 32]; + const int warp_id = threadIdx.x / 32; + const int lane_id = threadIdx.x % 32; + // Calculate global ID of the warp to stride through AABBs + const int global_warp_id = (blockIdx.x * blockDim.x + threadIdx.x) / 32; + const int total_warps = (gridDim.x * blockDim.x) / 32; + + // Grid-Stride Loop: Each warp processes one AABB (one group of points) + for (uint32_t aabb_id = global_warp_id; aabb_id < n_aabbs; aabb_id += total_warps) { + INDEX_T idx_begin = aabb_id * group_size; + INDEX_T idx_end = thrust::min((INDEX_T)np, (INDEX_T)(idx_begin + group_size)); + int count = idx_end - idx_begin; + + // 1. 
Initialize Thread-Local Accumulators (Registers) + // Initialize to limits so empty/out-of-bounds threads don't affect reduction + scalar_t thread_min[n_dim]; + scalar_t thread_max[n_dim]; + +#pragma unroll + for (int d = 0; d < n_dim; d++) { + thread_min[d] = std::numeric_limits::max(); + thread_max[d] = std::numeric_limits::lowest(); + } + + // 2. Loop over the points in the group (Stride by 32) + // Every thread processes roughly group_size/32 points + for (int i = lane_id; i < count; i += 32) { + // Load index (Coalesced access to indices) + INDEX_T point_idx = p_reordered_indices[idx_begin + i]; + + // Load Point (Indirect access - unavoidable due to reordering) + const POINT_T& p = v_points[point_idx]; + +// Accumulate min/max locally in registers +#pragma unroll + for (int d = 0; d < n_dim; d++) { + scalar_t val = p.get_coordinate(d); + thread_min[d] = thrust::min(thread_min[d], val); + thread_max[d] = thrust::max(thread_max[d], val); + } + } + + // 3. Warp Reduction (Perform once per dimension per AABB) + POINT_T final_min, final_max; +#pragma unroll + for (int d = 0; d < n_dim; d++) { + // CUB WarpReduce handles the cross-lane communication + scalar_t agg_min = + WarpReduce(temp_storage[warp_id]).Reduce(thread_min[d], thrust::minimum<>()); + scalar_t agg_max = + WarpReduce(temp_storage[warp_id]).Reduce(thread_max[d], thrust::maximum<>()); + + // Only lane 0 holds the valid reduction result + if (lane_id == 0) { + final_min.set_coordinate(d, agg_min); + final_max.set_coordinate(d, agg_max); + } + } + + // 4. 
Store Results to Global Memory + if (lane_id == 0) { + p_np_per_aabb[aabb_id] = count; + + if (count > 0) { + box_t ext_mbr(final_min, final_max); + v_mbrs[aabb_id] = ext_mbr; + p_aabbs[aabb_id] = ext_mbr.ToOptixAabb(); + } else { + // Handle empty AABB case + OptixAabb empty_aabb; + empty_aabb.minX = empty_aabb.minY = empty_aabb.minZ = 0.0f; + empty_aabb.maxX = empty_aabb.maxY = empty_aabb.maxZ = -1.0f; + v_mbrs[aabb_id] = box_t(); + p_aabbs[aabb_id] = empty_aabb; + } + } + } + }); + prefix_sum.resize(n_aabbs + 1, stream); + prefix_sum.set_element_to_zero_async(0, stream); + thrust::inclusive_scan(rmm::exec_policy_nosync(stream), np_per_aabb.begin(), + np_per_aabb.end(), prefix_sum.begin() + 1); +#ifndef NDEBUG + auto* p_prefix_sum = prefix_sum.data(); + + thrust::for_each(rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), + thrust::counting_iterator(aabbs.size()), + [=] __device__(size_t aabb_idx) { + auto begin = p_prefix_sum[aabb_idx]; + auto end = p_prefix_sum[aabb_idx + 1]; + const auto& aabb = p_aabbs[aabb_idx]; + + for (auto i = begin; i < end; i++) { + auto point_idx = p_reordered_indices[i]; + const auto& p = v_points[point_idx]; + for (int dim = 0; dim < n_dim; dim++) { + auto coord = p.get_coordinate(dim); + assert(coord >= (&aabb.minX)[dim] && coord <= (&aabb.maxX)[dim]); + assert(v_mbrs[aabb_idx].covers(p)); + } + } + }); +#endif + return std::move(aabbs); +} + +template +void RefineExactPoints(rmm::cuda_stream_view stream, ArrayView build_points, + ArrayView probe_points, ArrayView prefix_sum, + ArrayView reordered_indices, ArrayView rect_ids, + ArrayView point_ids, Queue& build_indices, + ArrayView probe_indices) { + auto d_queue = build_indices.DeviceObject(); + + LaunchKernel(stream, [=] __device__() mutable { + auto lane_id = threadIdx.x % 32; + auto global_warp_id = TID_1D / 32; + auto n_warps = TOTAL_THREADS_1D / 32; + + for (uint32_t i = global_warp_id; i < rect_ids.size(); i += n_warps) { + auto rect_id = rect_ids[i]; + auto 
point_id = point_ids[i]; + auto build_point_begin = prefix_sum[rect_id]; + auto build_point_end = prefix_sum[rect_id + 1]; + + for (uint32_t j = lane_id + build_point_begin; j < build_point_end; + j += WARP_SIZE) { + auto build_point_id = reordered_indices[j]; + const auto& build_point = build_points[build_point_id]; + const auto& probe_point = probe_points[point_id]; + if (build_point == probe_point) { + auto tail = d_queue.Append(build_point_id); + probe_indices[tail] = point_id; + } + } + } + }); +} +} // namespace detail + +template +RTSpatialIndex::RTSpatialIndex(const RTSpatialIndexConfig& config) + : config_(config), + stream_pool_(std::make_unique(config_.concurrency)), + indexing_points_(false), + handle_(0) {} + +template +void RTSpatialIndex::Clear() { + GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), Clear", this, + rmm::available_device_memory().first / 1024 / 1024); + auto stream = rmm::cuda_stream_default; + bvh_buffer_.resize(0, stream); + bvh_buffer_.shrink_to_fit(stream); + rects_.resize(0, stream); + rects_.shrink_to_fit(stream); + points_.resize(0, stream); + points_.shrink_to_fit(stream); + stream.synchronize(); +} + +template +void RTSpatialIndex::PushBuild(const box_t* rects, uint32_t n_rects) { + GPUSPATIAL_LOG_INFO("RTSpatialIndex %p (Free %zu MB), PushBuild, rectangles %zu", this, + rmm::available_device_memory().first / 1024 / 1024, n_rects); + if (n_rects == 0) return; + auto stream = rmm::cuda_stream_default; + auto prev_size = rects_.size(); + + rects_.resize(rects_.size() + n_rects, stream); + CUDA_CHECK(cudaMemcpyAsync(rects_.data() + prev_size, rects, sizeof(box_t) * n_rects, + cudaMemcpyHostToDevice, stream)); +} + +template +void RTSpatialIndex::FinishBuilding() { + auto stream = rmm::cuda_stream_default; + + indexing_points_ = thrust::all_of(rmm::exec_policy_nosync(stream), rects_.begin(), + rects_.end(), [] __device__(const box_t& box) { + bool is_point = true; + for (int dim = 0; dim < n_dim; dim++) { + is_point &= 
box.get_min(dim) == box.get_max(dim); + } + return is_point; + }); + + rmm::device_uvector aabbs{0, stream}; + if (indexing_points_) { + points_.resize(rects_.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), rects_.begin(), rects_.end(), + points_.begin(), + [] __device__(const box_t& box) { return box.get_min(); }); + aabbs = std::move(detail::ComputeAABBs(stream, points_, point_ranges_, + reordered_point_indices_, + config_.n_points_per_aabb, rects_)); + } else { + aabbs = std::move(detail::ComputeAABBs(stream, ArrayView(rects_))); + } + + handle_ = config_.rt_engine->BuildAccelCustom(stream, ArrayView(aabbs), + bvh_buffer_, config_.prefer_fast_build, + config_.compact); + + GPUSPATIAL_LOG_INFO( + "RTSpatialIndex %p (Free %zu MB), FinishBuilding Index on %s, Total geoms: %zu", + this, rmm::available_device_memory().first / 1024 / 1024, + indexing_points_ ? "Points" : "Rectangles", numGeometries()); +} + +template +void RTSpatialIndex::Probe(const box_t* rects, uint32_t n_rects, + std::vector* build_indices, + std::vector* probe_indices) { + if (n_rects == 0) return; + SpatialIndexContext ctx; + auto stream = stream_pool_->get_stream(); + rmm::device_uvector d_rects(n_rects, stream); + rmm::device_uvector d_points{0, stream}; + + CUDA_CHECK(cudaMemcpyAsync(d_rects.data(), rects, sizeof(box_t) * n_rects, + cudaMemcpyHostToDevice, stream)); + + bool probe_points = thrust::all_of(rmm::exec_policy_nosync(stream), d_rects.begin(), + d_rects.end(), [] __device__(const box_t& box) { + bool is_point = true; + for (int dim = 0; dim < n_dim; dim++) { + is_point &= box.get_min(dim) == box.get_max(dim); + } + return is_point; + }); + + if (probe_points) { + d_points.resize(d_rects.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), d_rects.begin(), d_rects.end(), + d_points.begin(), + [] __device__(const box_t& box) { return box.get_min(); }); + d_rects.resize(0, stream); + d_rects.shrink_to_fit(stream); + + } else { + // Build a BVH 
over the MBRs of the stream geometries +#ifdef GPUSPATIAL_PROFILING + ctx.timer.start(stream); +#endif + rmm::device_uvector aabbs(n_rects, stream); + thrust::transform(rmm::exec_policy_nosync(stream), d_rects.begin(), d_rects.end(), + aabbs.begin(), + [] __device__(const box_t& mbr) { return mbr.ToOptixAabb(); }); + ctx.handle = config_.rt_engine->BuildAccelCustom( + stream, ArrayView(aabbs), ctx.bvh_buffer, config_.prefer_fast_build, + config_.compact); +#ifdef GPUSPATIAL_PROFILING + ctx.bvh_build_ms = ctx.timer.stop(stream); +#endif + } + + ctx.counter = std::make_unique>(0, stream); + + bool swap_ids = false; + + auto query = [&](bool counting) { +#ifdef GPUSPATIAL_PROFILING + ctx.timer.start(stream); +#endif + if (indexing_points_) { + if (probe_points) { + handleBuildPoint(ctx, ArrayView(d_points), counting); + } else { + handleBuildPoint(ctx, ArrayView(d_rects), counting); + swap_ids = true; + } + } else { + if (probe_points) { + handleBuildBox(ctx, ArrayView(d_points), counting); + } else { + handleBuildBox(ctx, ArrayView(d_rects), counting); + } + } +#ifdef GPUSPATIAL_PROFILING + ctx.rt_ms += ctx.timer.stop(stream); +#endif + }; + + // first pass: counting + query(true /* counting */); + + auto cap = ctx.counter->value(stream); + if (cap == 0) { + return; + } + allocateResultBuffer(ctx, cap); + // second pass: retrieve results + query(false /* counting */); + + auto result_size = ctx.build_indices.size(stream); + ArrayView v_build_indices(ctx.build_indices.data(), result_size); + ArrayView v_probe_indices(ctx.probe_indices.data(), result_size); + + if (swap_ids) { + // IMPORTANT: In this case, the BVH is built on probe side and points are + // cast on the build side, so the result pairs are (probe_id, build_id) instead of + // (build_id, probe_id). We need to swap the output buffers to correct this. 
+ std::swap(v_build_indices, v_probe_indices); + } + +#ifdef GPUSPATIAL_PROFILING + Stopwatch sw; + sw.start(); +#endif + build_indices->resize(result_size); + CUDA_CHECK(cudaMemcpyAsync(build_indices->data(), v_build_indices.data(), + sizeof(index_t) * result_size, cudaMemcpyDeviceToHost, + stream)); + + probe_indices->resize(result_size); + CUDA_CHECK(cudaMemcpyAsync(probe_indices->data(), v_probe_indices.data(), + sizeof(index_t) * result_size, cudaMemcpyDeviceToHost, + stream)); + stream.synchronize(); +#ifdef GPUSPATIAL_PROFILING + sw.stop(); + ctx.copy_res_ms = sw.ms(); + GPUSPATIAL_LOG_INFO( + "RTSpatialIndex %p (Free %zu MB), Probe %s, Size: %zu, Results: %zu, Alloc: %.2f ms, BVH Build: %.2f ms, RT: %.2f ms, Copy res: %.2f ms", + this, rmm::available_device_memory().first / 1024 / 1024, + probe_points ? "Points" : "Rectangles", + probe_points ? d_points.size() : d_rects.size(), build_indices->size(), + ctx.alloc_ms, ctx.bvh_build_ms, ctx.rt_ms, ctx.copy_res_ms); +#endif +} + +template +void RTSpatialIndex::handleBuildPoint(SpatialIndexContext& ctx, + ArrayView points, + bool counting) const { + using launch_params_t = detail::LaunchParamsPointQuery; + + ctx.shader_id = GetPointQueryShaderId(); + ctx.launch_params_buffer.resize(sizeof(launch_params_t), ctx.stream); + ctx.h_launch_params_buffer.resize(sizeof(launch_params_t)); + auto& launch_params = + *reinterpret_cast(ctx.h_launch_params_buffer.data()); + + launch_params.rects = ArrayView(rects_); + launch_params.points = points; + launch_params.handle = handle_; + + uint32_t dim_x = std::min(OPTIX_MAX_RAYS, points.size()); + + if (counting) { + launch_params.count = ctx.counter->data(); + + CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), &launch_params, + sizeof(launch_params_t), cudaMemcpyHostToDevice, + ctx.stream)); + + filter(ctx, dim_x); + } else { + auto cap = ctx.build_indices.capacity(); + Queue rect_ids; + rmm::device_uvector point_ids(cap, ctx.stream); + + rect_ids.Init(ctx.stream, 
cap); + + launch_params.count = nullptr; + launch_params.rect_ids = rect_ids.DeviceObject(); + launch_params.point_ids = ArrayView(point_ids); + + CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), &launch_params, + sizeof(launch_params_t), cudaMemcpyHostToDevice, + ctx.stream)); + + filter(ctx, dim_x); + + detail::RefineExactPoints( + ctx.stream, ArrayView(points_), points, + ArrayView(point_ranges_), ArrayView(reordered_point_indices_), + ArrayView(rect_ids.data(), rect_ids.size(ctx.stream)), + ArrayView(point_ids), ctx.build_indices, + ArrayView(ctx.probe_indices)); + } +} + +template +void RTSpatialIndex::handleBuildPoint(SpatialIndexContext& ctx, + ArrayView rects, + bool counting) const { + using launch_params_t = detail::LaunchParamsPointQuery; + + ctx.shader_id = GetPointQueryShaderId(); + ctx.launch_params_buffer.resize(sizeof(launch_params_t), ctx.stream); + ctx.h_launch_params_buffer.resize(sizeof(launch_params_t)); + auto& launch_params = + *reinterpret_cast(ctx.h_launch_params_buffer.data()); + + launch_params.rects = rects; + launch_params.points = ArrayView(points_); + launch_params.handle = ctx.handle; + if (counting) { + launch_params.count = ctx.counter->data(); + } else { + launch_params.count = nullptr; + launch_params.rect_ids = ctx.build_indices.DeviceObject(); + launch_params.point_ids = ArrayView(ctx.probe_indices); + } + + CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), &launch_params, + sizeof(launch_params_t), cudaMemcpyHostToDevice, + ctx.stream)); + + uint32_t dim_x = std::min(OPTIX_MAX_RAYS, points_.size()); + + filter(ctx, dim_x); +} + +template +void RTSpatialIndex::handleBuildBox(SpatialIndexContext& ctx, + ArrayView points, + bool counting) const { + using launch_params_t = detail::LaunchParamsPointQuery; + + ctx.shader_id = GetPointQueryShaderId(); + ctx.launch_params_buffer.resize(sizeof(launch_params_t), ctx.stream); + ctx.h_launch_params_buffer.resize(sizeof(launch_params_t)); + auto& launch_params = + 
*reinterpret_cast(ctx.h_launch_params_buffer.data()); + + launch_params.rects = ArrayView(rects_); + launch_params.points = points; + launch_params.handle = handle_; + if (counting) { + launch_params.count = ctx.counter->data(); + } else { + launch_params.count = nullptr; + launch_params.rect_ids = ctx.build_indices.DeviceObject(); + launch_params.point_ids = + ArrayView(ctx.probe_indices.data(), ctx.probe_indices.size()); + } + + CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), &launch_params, + sizeof(launch_params_t), cudaMemcpyHostToDevice, + ctx.stream)); + + uint32_t dim_x = std::min(OPTIX_MAX_RAYS, points.size()); + + filter(ctx, dim_x); +} + +template +void RTSpatialIndex::handleBuildBox(SpatialIndexContext& ctx, + ArrayView rects, + bool counting) const { + // forward cast: cast rays from stream geometries with the BVH of build geometries + { + auto dim_x = std::min(OPTIX_MAX_RAYS, rects.size()); + + prepareLaunchParamsBoxQuery(ctx, rects, true /* forward */, counting); + filter(ctx, dim_x); + } + // backward cast: cast rays from the build geometries with the BVH of stream geometries + { + auto dim_x = std::min(OPTIX_MAX_RAYS, rects_.size()); + + prepareLaunchParamsBoxQuery(ctx, rects, false /* forward */, counting); + filter(ctx, dim_x); + } +} + +template +void RTSpatialIndex::allocateResultBuffer(SpatialIndexContext& ctx, + uint32_t capacity) const { +#ifdef GPUSPATIAL_PROFILING + ctx.timer.start(ctx.stream); +#endif + + GPUSPATIAL_LOG_INFO( + "RTSpatialIndex %p (Free %zu MB), Allocate result buffer, memory consumption %zu MB, capacity %u", + this, rmm::available_device_memory().first / 1024 / 1024, + (uint64_t)capacity * 2 * sizeof(index_t) / 1024 / 1024, capacity); + + ctx.build_indices.Init(ctx.stream, capacity); + ctx.probe_indices.resize(capacity, ctx.stream); +#ifdef GPUSPATIAL_PROFILING + ctx.alloc_ms += ctx.timer.stop(ctx.stream); +#endif +} + +template +void RTSpatialIndex::prepareLaunchParamsBoxQuery( + SpatialIndexContext& ctx, 
ArrayView probe_rects, bool forward, + bool counting) const { + using launch_params_t = detail::LaunchParamsBoxQuery; + ctx.launch_params_buffer.resize(sizeof(launch_params_t), ctx.stream); + ctx.h_launch_params_buffer.resize(sizeof(launch_params_t)); + auto& launch_params = + *reinterpret_cast(ctx.h_launch_params_buffer.data()); + + launch_params.rects1 = ArrayView(rects_); + launch_params.rects2 = probe_rects; + + if (forward) { + launch_params.handle = handle_; + ctx.shader_id = GetBoxQueryForwardShaderId(); + } else { + launch_params.handle = ctx.handle; + ctx.shader_id = GetBoxQueryBackwardShaderId(); + } + + if (counting) { + launch_params.count = ctx.counter->data(); + } else { + launch_params.count = nullptr; + launch_params.rect1_ids = ctx.build_indices.DeviceObject(); + launch_params.rect2_ids = ArrayView(ctx.probe_indices); + } + + CUDA_CHECK(cudaMemcpyAsync(ctx.launch_params_buffer.data(), &launch_params, + sizeof(launch_params_t), cudaMemcpyHostToDevice, + ctx.stream)); +} + +template +void RTSpatialIndex::filter(SpatialIndexContext& ctx, + uint32_t dim_x) const { +#ifdef GPUSPATIAL_PROFILING + ctx.timer.start(ctx.stream); +#endif + if (dim_x > 0) { + config_.rt_engine->Render(ctx.stream, ctx.shader_id, dim3{dim_x, 1, 1}, + ArrayView((char*)ctx.launch_params_buffer.data(), + ctx.launch_params_buffer.size())); + } +#ifdef GPUSPATIAL_PROFILING + ctx.rt_ms += ctx.timer.stop(ctx.stream); +#endif +} + +template +std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config) { + auto index = std::make_unique>(config); + GPUSPATIAL_LOG_INFO( + "Create RTSpatialIndex %p, fast_build = %d, compact = %d, concurrency = %d", + index.get(), config.prefer_fast_build, config.compact, config.concurrency); + return std::move(index); +} + +template std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config); +template std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config); +template std::unique_ptr> 
CreateRTSpatialIndex( + const RTSpatialIndexConfig& config); +template std::unique_ptr> CreateRTSpatialIndex( + const RTSpatialIndexConfig& config); +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu new file mode 100644 index 000000000..c40f05dd6 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/src/rt_spatial_refiner.cu @@ -0,0 +1,548 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+#include "gpuspatial/loader/parallel_wkb_loader.h" +#include "gpuspatial/refine/rt_spatial_refiner.cuh" +#include "gpuspatial/relate/relate_engine.cuh" +#include "gpuspatial/utils/logger.hpp" + +#include "rt/shaders/shader_id.hpp" + +#include "rmm/cuda_stream_pool.hpp" +#include "rmm/exec_policy.hpp" + + +#include +#include +#include + +#include +#include +#include +#include + +#define OPTIX_MAX_RAYS (1lu << 30) + +namespace gpuspatial { + +namespace detail { +template +void ReorderIndices(rmm::cuda_stream_view stream, INDEX_IT index_begin, + INDEX_IT index_end, + rmm::device_uvector& sorted_uniq_indices, + rmm::device_uvector& reordered_indices) { + auto sorted_begin = sorted_uniq_indices.begin(); + auto sorted_end = sorted_uniq_indices.end(); + thrust::transform(rmm::exec_policy_nosync(stream), index_begin, index_end, + reordered_indices.begin(), [=] __device__(uint32_t val) { + auto it = + thrust::lower_bound(thrust::seq, sorted_begin, sorted_end, val); + return thrust::distance(sorted_begin, it); + }); +} + +template +struct PipelineSlot { + rmm::cuda_stream_view stream; + std::unique_ptr loader; + std::future prep_future; + + RTSpatialRefiner::IndicesMap indices_map; + + // These will be moved out after every batch + rmm::device_uvector d_batch_build_indices; + rmm::device_uvector d_batch_probe_indices; + + PipelineSlot(rmm::cuda_stream_view s, const std::shared_ptr& tp, + typename LoaderT::Config config) + : stream(s), d_batch_build_indices(0, s), d_batch_probe_indices(0, s) { + loader = std::make_unique(tp); + loader->Init(config); + } +}; +} // namespace detail + +RTSpatialRefiner::RTSpatialRefiner(const RTSpatialRefinerConfig& config) + : config_(config) { + thread_pool_ = std::make_shared(config_.parsing_threads); + stream_pool_ = std::make_unique(config_.concurrency); + CUDA_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, config_.stack_size_bytes)); + wkb_loader_ = std::make_unique(thread_pool_); + + ParallelWkbLoader::Config loader_config; + + 
loader_config.memory_quota = config_.wkb_parser_memory_quota; + + wkb_loader_->Init(loader_config); +} + +void RTSpatialRefiner::Clear() { + auto stream = rmm::cuda_stream_default; + wkb_loader_->Clear(stream); + build_geometries_.Clear(stream); +} + +void RTSpatialRefiner::PushBuild(const ArrowSchema* build_schema, + const ArrowArray* build_array) { + auto stream = rmm::cuda_stream_default; + + wkb_loader_->Parse(stream, build_schema, build_array, 0, build_array->length); +} + +void RTSpatialRefiner::FinishBuilding() { + auto stream = rmm::cuda_stream_default; + build_geometries_ = std::move(wkb_loader_->Finish(stream)); +} + +uint32_t RTSpatialRefiner::Refine(const ArrowSchema* probe_schema, + const ArrowArray* probe_array, Predicate predicate, + uint32_t* build_indices, uint32_t* probe_indices, + uint32_t len) { + if (len == 0) { + return 0; + } + + if (config_.pipeline_batches > 1) { + return RefinePipelined(probe_schema, probe_array, predicate, build_indices, + probe_indices, len); + } + + SpatialRefinerContext ctx; + ctx.cuda_stream = stream_pool_->get_stream(); + + IndicesMap probe_indices_map; + rmm::device_uvector d_probe_indices(len, ctx.cuda_stream); + + CUDA_CHECK(cudaMemcpyAsync(d_probe_indices.data(), probe_indices, + sizeof(uint32_t) * len, cudaMemcpyHostToDevice, + ctx.cuda_stream)); + + buildIndicesMap(ctx.cuda_stream, d_probe_indices.begin(), d_probe_indices.end(), + probe_indices_map); + + loader_t loader(thread_pool_); + loader_t::Config loader_config; + loader_config.memory_quota = config_.wkb_parser_memory_quota / config_.concurrency; + + loader.Init(loader_config); + loader.Parse(ctx.cuda_stream, probe_schema, probe_array, + probe_indices_map.h_uniq_indices.begin(), + probe_indices_map.h_uniq_indices.end()); + auto probe_geoms = std::move(loader.Finish(ctx.cuda_stream)); + + GPUSPATIAL_LOG_INFO( + "RTSpatialRefiner %p (Free %zu MB), Loaded Geometries, ProbeArray %ld, Loaded %u, Type %s", + this, rmm::available_device_memory().first / 1024 / 
1024, probe_array->length, + probe_geoms.num_features(), + GeometryTypeToString(probe_geoms.get_geometry_type()).c_str()); + + RelateEngine relate_engine(&build_geometries_, + config_.rt_engine.get()); + RelateEngine::Config re_config; + + re_config.memory_quota = config_.relate_engine_memory_quota / config_.concurrency; + re_config.bvh_fast_build = config_.prefer_fast_build; + re_config.bvh_compact = config_.compact; + + relate_engine.set_config(re_config); + + rmm::device_uvector d_build_indices(len, ctx.cuda_stream); + CUDA_CHECK(cudaMemcpyAsync(d_build_indices.data(), build_indices, + sizeof(uint32_t) * len, cudaMemcpyHostToDevice, + ctx.cuda_stream)); + + GPUSPATIAL_LOG_INFO( + "RTSpatialRefiner %p (Free %zu MB), Evaluating %u Geometry Pairs with Predicate %s", + this, rmm::available_device_memory().first / 1024 / 1024, len, + PredicateToString(predicate)); + + ctx.timer.start(ctx.cuda_stream); + relate_engine.Evaluate(ctx.cuda_stream, probe_geoms, predicate, d_build_indices, + probe_indices_map.d_reordered_indices); + float refine_ms = ctx.timer.stop(ctx.cuda_stream); + auto new_size = d_build_indices.size(); + + GPUSPATIAL_LOG_INFO("RTSpatialRefiner %p (Free %zu MB), Refine time %f, new size %zu", + this, rmm::available_device_memory().first / 1024 / 1024, refine_ms, + new_size); + + d_probe_indices.resize(new_size, ctx.cuda_stream); + + thrust::gather(rmm::exec_policy_nosync(ctx.cuda_stream), + probe_indices_map.d_reordered_indices.begin(), + probe_indices_map.d_reordered_indices.end(), + probe_indices_map.d_uniq_indices.begin(), d_probe_indices.begin()); + + if (config_.sort_probe_indices) { + thrust::sort_by_key(rmm::exec_policy_nosync(ctx.cuda_stream), d_probe_indices.begin(), + d_probe_indices.end(), d_build_indices.begin()); + } + + CUDA_CHECK(cudaMemcpyAsync(build_indices, d_build_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToHost, + ctx.cuda_stream)); + + CUDA_CHECK(cudaMemcpyAsync(probe_indices, d_probe_indices.data(), + 
sizeof(uint32_t) * new_size, cudaMemcpyDeviceToHost, + ctx.cuda_stream)); + ctx.cuda_stream.synchronize(); + return new_size; +} + +uint32_t RTSpatialRefiner::RefinePipelined(const ArrowSchema* probe_schema, + const ArrowArray* probe_array, + Predicate predicate, uint32_t* build_indices, + uint32_t* probe_indices, uint32_t len) { + if (len == 0) return 0; + auto main_stream = stream_pool_->get_stream(); + + rmm::device_uvector d_build_indices(len, main_stream); + rmm::device_uvector d_probe_indices(len, main_stream); + + CUDA_CHECK(cudaMemcpyAsync(d_build_indices.data(), build_indices, + sizeof(uint32_t) * len, cudaMemcpyHostToDevice, + main_stream)); + CUDA_CHECK(cudaMemcpyAsync(d_probe_indices.data(), probe_indices, + sizeof(uint32_t) * len, cudaMemcpyHostToDevice, + main_stream)); + + thrust::sort_by_key(rmm::exec_policy_nosync(main_stream), d_probe_indices.begin(), + d_probe_indices.end(), d_build_indices.begin()); + + rmm::device_uvector d_final_build_indices(len, main_stream); + rmm::device_uvector d_final_probe_indices(len, main_stream); + + uint32_t tail_offset = 0; + + // Capture device ID for thread safety + int device_id; + CUDA_CHECK(cudaGetDevice(&device_id)); + + // Pipeline Config + const int NUM_SLOTS = 2; + int n_batches = config_.pipeline_batches; + size_t batch_size = (len + n_batches - 1) / n_batches; + + GPUSPATIAL_LOG_INFO( + "RTSpatialRefiner %p, pipeline refinement, total len %u, batches %d, batch size %zu", + this, len, n_batches, batch_size); + + // Resource allocation for slots + using loader_t = ParallelWkbLoader; + loader_t::Config loader_config; + loader_config.memory_quota = + config_.wkb_parser_memory_quota / config_.concurrency / NUM_SLOTS; + + rmm::cuda_stream_pool local_pool(NUM_SLOTS); + std::vector>> slots; + + for (int i = 0; i < NUM_SLOTS; ++i) { + slots.push_back(std::make_unique>( + local_pool.get_stream(), thread_pool_, loader_config)); + } + + // Engine Setup (Shared across slots) + RelateEngine 
relate_engine(&build_geometries_, + config_.rt_engine.get()); + RelateEngine::Config re_config; + re_config.memory_quota = + config_.relate_engine_memory_quota / config_.concurrency / NUM_SLOTS; + re_config.bvh_fast_build = config_.prefer_fast_build; + re_config.bvh_compact = config_.compact; + relate_engine.set_config(re_config); + + // --- BACKGROUND TASK (CPU Phase) --- + // This lambda handles: buildIndicesMap + WKB Parsing + auto prepare_batch_task = [&](detail::PipelineSlot* slot, + size_t offset, size_t count) { + // 1. Critical: Set context for this thread + CUDA_CHECK(cudaSetDevice(device_id)); + + // 2. Wait for GPU to finish previous work on this slot + slot->stream.synchronize(); + + // 3. Prepare Indices (CPU + H2D) + const uint32_t* batch_probe_ptr = d_probe_indices.data() + offset; + buildIndicesMap(slot->stream, batch_probe_ptr, batch_probe_ptr + count, + slot->indices_map); + + // 4. Parse WKB (CPU Heavy) + slot->loader->Clear(slot->stream); + slot->loader->Parse(slot->stream, probe_schema, probe_array, + slot->indices_map.h_uniq_indices.begin(), + slot->indices_map.h_uniq_indices.end()); + + // Return future geometries (H2D copy happens on Finish) + return slot->loader->Finish(slot->stream); + }; + + // --- PIPELINE PRIMING --- + // Start processing Batch 0 immediately in background + size_t first_batch_len = std::min(batch_size, (size_t)len); + slots[0]->prep_future = std::async(std::launch::async, prepare_batch_task, + slots[0].get(), 0, first_batch_len); + + main_stream.synchronize(); // Ensure allocation is done before main loop + + // --- MAIN PIPELINE LOOP --- + for (size_t offset = 0; offset < len; offset += batch_size) { + int curr_idx = (offset / batch_size) % NUM_SLOTS; + int next_idx = (curr_idx + 1) % NUM_SLOTS; + auto& curr_slot = slots[curr_idx]; + auto& next_slot = slots[next_idx]; + size_t current_batch_len = std::min(batch_size, len - offset); + + // 1. 
WAIT & RETRIEVE: Get Geometries from Background Task + // This will block only if CPU work for this batch is slower than GPU work for + // previous batch + dev_geometries_t probe_geoms; + if (curr_slot->prep_future.valid()) { + probe_geoms = std::move(curr_slot->prep_future.get()); + } + + // 2. KICKOFF NEXT: Start CPU work for Batch (N+1) + size_t next_offset = offset + batch_size; + if (next_offset < len) { + size_t next_len = std::min(batch_size, len - next_offset); + next_slot->prep_future = std::async(std::launch::async, prepare_batch_task, + next_slot.get(), next_offset, next_len); + } + + // 3. GPU EXECUTION PHASE + const uint32_t* batch_build_ptr = d_build_indices.data() + offset; + + // Copy build indices for this batch + curr_slot->d_batch_build_indices.resize(current_batch_len, curr_slot->stream); + CUDA_CHECK(cudaMemcpyAsync(curr_slot->d_batch_build_indices.data(), batch_build_ptr, + sizeof(uint32_t) * current_batch_len, + cudaMemcpyHostToDevice, curr_slot->stream)); + + // Relate/Refine + // Note: Evaluate filters d_batch_build_indices in-place + relate_engine.Evaluate(curr_slot->stream, probe_geoms, predicate, + curr_slot->d_batch_build_indices, + curr_slot->indices_map.d_reordered_indices); + + // 4. 
GATHER & APPEND RESULTS + // We need the size to know how much to gather + size_t new_size = curr_slot->d_batch_build_indices.size(); + + if (new_size > 0) { + // Gather original probe indices + curr_slot->d_batch_probe_indices.resize(new_size, curr_slot->stream); + thrust::gather(rmm::exec_policy_nosync(curr_slot->stream), + curr_slot->indices_map.d_reordered_indices.begin(), + curr_slot->indices_map.d_reordered_indices.end(), + curr_slot->indices_map.d_uniq_indices.begin(), + curr_slot->d_batch_probe_indices.begin()); + + // Append to Final Buffers (Device-to-Device Copy) + CUDA_CHECK(cudaMemcpyAsync(d_final_build_indices.data() + tail_offset, + curr_slot->d_batch_build_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToDevice, + curr_slot->stream)); + + CUDA_CHECK(cudaMemcpyAsync(d_final_probe_indices.data() + tail_offset, + curr_slot->d_batch_probe_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToDevice, + curr_slot->stream)); + + tail_offset += new_size; + } + } + + // --- FINALIZATION --- + + // Wait for all streams to finish writing to final buffers + for (auto& slot : slots) { + slot->stream.synchronize(); + } + + // Shrink probe vector to actual size for sorting + d_final_probe_indices.resize(tail_offset, main_stream); + d_final_build_indices.resize(tail_offset, main_stream); + + if (config_.sort_probe_indices) { + thrust::sort_by_key(rmm::exec_policy_nosync(main_stream), + d_final_probe_indices.begin(), + d_final_probe_indices.end(), // Sort only valid range + d_final_build_indices.begin()); + } + + // Final Copy to Host + CUDA_CHECK(cudaMemcpyAsync(build_indices, d_final_build_indices.data(), + sizeof(uint32_t) * tail_offset, cudaMemcpyDeviceToHost, + main_stream)); + + CUDA_CHECK(cudaMemcpyAsync(probe_indices, d_final_probe_indices.data(), + sizeof(uint32_t) * tail_offset, cudaMemcpyDeviceToHost, + main_stream)); + + main_stream.synchronize(); + return tail_offset; +} + +uint32_t RTSpatialRefiner::Refine(const ArrowSchema* 
build_schema, + const ArrowArray* build_array, + const ArrowSchema* probe_schema, + const ArrowArray* probe_array, Predicate predicate, + uint32_t* build_indices, uint32_t* probe_indices, + uint32_t len) { + if (len == 0) { + return 0; + } + + auto cuda_stream = stream_pool_->get_stream(); + SpatialRefinerContext ctx; + + ctx.cuda_stream = cuda_stream; + + IndicesMap build_indices_map, probe_indices_map; + rmm::device_uvector d_indices(len, cuda_stream); + + CUDA_CHECK(cudaMemcpyAsync(d_indices.data(), build_indices, sizeof(uint32_t) * len, + cudaMemcpyHostToDevice, cuda_stream)); + buildIndicesMap(cuda_stream, d_indices.begin(), d_indices.end(), build_indices_map); + + CUDA_CHECK(cudaMemcpyAsync(d_indices.data(), probe_indices, sizeof(uint32_t) * len, + cudaMemcpyHostToDevice, cuda_stream)); + buildIndicesMap(cuda_stream, d_indices.begin(), d_indices.end(), probe_indices_map); + d_indices.resize(0, cuda_stream); + d_indices.shrink_to_fit(cuda_stream); + + loader_t loader(thread_pool_); + loader_t::Config loader_config; + loader_config.memory_quota = config_.wkb_parser_memory_quota / config_.concurrency; + loader.Init(loader_config); + loader.Parse(ctx.cuda_stream, build_schema, build_array, + build_indices_map.h_uniq_indices.begin(), + build_indices_map.h_uniq_indices.end()); + auto geoms1 = std::move(loader.Finish(ctx.cuda_stream)); + + loader.Clear(ctx.cuda_stream); + loader.Parse(ctx.cuda_stream, probe_schema, probe_array, + probe_indices_map.h_uniq_indices.begin(), + probe_indices_map.h_uniq_indices.end()); + auto geoms2 = std::move(loader.Finish(ctx.cuda_stream)); + + GPUSPATIAL_LOG_INFO( + "RTSpatialRefiner %p (Free %zu MB), Loaded Geometries, build_array %ld, Loaded %u, Type %s, probe_array %ld, Loaded %u, Type %s", + this, rmm::available_device_memory().first / 1024 / 1024, build_array->length, + geoms1.num_features(), GeometryTypeToString(geoms1.get_geometry_type()).c_str(), + probe_array->length, geoms2.num_features(), + 
GeometryTypeToString(geoms2.get_geometry_type()).c_str()); + + RelateEngine relate_engine(&geoms1, config_.rt_engine.get()); + RelateEngine::Config re_config; + + re_config.memory_quota = config_.relate_engine_memory_quota / config_.concurrency; + re_config.bvh_fast_build = config_.prefer_fast_build; + re_config.bvh_compact = config_.compact; + + relate_engine.set_config(re_config); + + GPUSPATIAL_LOG_INFO( + "RTSpatialRefiner %p (Free %zu MB), Evaluating %u Geometry Pairs with Predicate %s", + this, rmm::available_device_memory().first / 1024 / 1024, len, + PredicateToString(predicate)); + + ctx.timer.start(ctx.cuda_stream); + + relate_engine.Evaluate(ctx.cuda_stream, geoms2, predicate, + build_indices_map.d_reordered_indices, + probe_indices_map.d_reordered_indices); + float refine_ms = ctx.timer.stop(ctx.cuda_stream); + + auto new_size = build_indices_map.d_reordered_indices.size(); + GPUSPATIAL_LOG_INFO("RTSpatialRefiner %p (Free %zu MB), Refine time %f, new size %zu", + this, rmm::available_device_memory().first / 1024 / 1024, refine_ms, + new_size); + rmm::device_uvector d_build_indices(new_size, ctx.cuda_stream); + rmm::device_uvector d_probe_indices(new_size, ctx.cuda_stream); + + thrust::gather(rmm::exec_policy_nosync(ctx.cuda_stream), + build_indices_map.d_reordered_indices.begin(), + build_indices_map.d_reordered_indices.end(), + build_indices_map.d_uniq_indices.begin(), d_build_indices.begin()); + + thrust::gather(rmm::exec_policy_nosync(ctx.cuda_stream), + probe_indices_map.d_reordered_indices.begin(), + probe_indices_map.d_reordered_indices.end(), + probe_indices_map.d_uniq_indices.begin(), d_probe_indices.begin()); + + if (config_.sort_probe_indices) { + thrust::sort_by_key(rmm::exec_policy_nosync(ctx.cuda_stream), d_probe_indices.begin(), + d_probe_indices.end(), d_build_indices.begin()); + } + + CUDA_CHECK(cudaMemcpyAsync(build_indices, d_build_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToHost, + ctx.cuda_stream)); + + 
CUDA_CHECK(cudaMemcpyAsync(probe_indices, d_probe_indices.data(), + sizeof(uint32_t) * new_size, cudaMemcpyDeviceToHost, + ctx.cuda_stream)); + ctx.cuda_stream.synchronize(); + return new_size; +} + +template +void RTSpatialRefiner::buildIndicesMap(rmm::cuda_stream_view stream, INDEX_IT index_begin, + INDEX_IT index_end, + IndicesMap& indices_map) const { + auto len = thrust::distance(index_begin, index_end); + auto& d_uniq_indices = indices_map.d_uniq_indices; + auto& h_uniq_indices = indices_map.h_uniq_indices; + + d_uniq_indices.resize(len, stream); + CUDA_CHECK(cudaMemcpyAsync(d_uniq_indices.data(), index_begin, sizeof(uint32_t) * len, + cudaMemcpyDeviceToDevice, stream)); + + thrust::sort(rmm::exec_policy_nosync(stream), d_uniq_indices.begin(), + d_uniq_indices.end()); + auto uniq_end = thrust::unique(rmm::exec_policy_nosync(stream), d_uniq_indices.begin(), + d_uniq_indices.end()); + auto uniq_size = thrust::distance(d_uniq_indices.begin(), uniq_end); + + d_uniq_indices.resize(uniq_size, stream); + h_uniq_indices.resize(uniq_size); + + CUDA_CHECK(cudaMemcpyAsync(h_uniq_indices.data(), d_uniq_indices.data(), + sizeof(uint32_t) * uniq_size, cudaMemcpyDeviceToHost, + stream)); + + auto& d_reordered_indices = indices_map.d_reordered_indices; + + d_reordered_indices.resize(len, stream); + detail::ReorderIndices(stream, index_begin, index_end, d_uniq_indices, + d_reordered_indices); +} + +std::unique_ptr CreateRTSpatialRefiner( + const RTSpatialRefinerConfig& config) { + auto refiner = std::make_unique(config); + GPUSPATIAL_LOG_INFO( + "Create RTSpatialRefiner %p, fast_build = %d, compact = %d, " + "parsing_threads = %u, concurrency = %u, pipeline_batches = %u, " + "wkb_parser_memory_quota = %.2f, relate_engine_memory_quota = %.2f", + refiner.get(), config.prefer_fast_build, config.compact, config.parsing_threads, + config.concurrency, config.pipeline_batches, config.wkb_parser_memory_quota, + config.relate_engine_memory_quota); + return std::move(refiner); +} + +} 
// namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/src/spatial_joiner.cu b/c/sedona-libgpuspatial/libgpuspatial/src/spatial_joiner.cu deleted file mode 100644 index 03aafaa27..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/src/spatial_joiner.cu +++ /dev/null @@ -1,483 +0,0 @@ - -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
-#include "gpuspatial/index/detail/launch_parameters.h" -#include "gpuspatial/index/relate_engine.cuh" -#include "gpuspatial/index/spatial_joiner.cuh" -#include "gpuspatial/loader/parallel_wkb_loader.h" -#include "gpuspatial/utils/logger.hpp" -#include "gpuspatial/utils/stopwatch.h" - -#include "rt/shaders/shader_id.hpp" - -#include "rmm/exec_policy.hpp" - -#define OPTIX_MAX_RAYS (1lu << 30) -namespace gpuspatial { - -namespace detail { - -template -static rmm::device_uvector ComputeAABBs( - rmm::cuda_stream_view stream, const ArrayView>>& mbrs) { - rmm::device_uvector aabbs(mbrs.size(), stream); - - thrust::transform(rmm::exec_policy_nosync(stream), mbrs.begin(), mbrs.end(), - aabbs.begin(), [] __device__(const Box>& mbr) { - OptixAabb aabb{0, 0, 0, 0, 0, 0}; - auto min_corner = mbr.get_min(); - auto max_corner = mbr.get_max(); - for (int dim = 0; dim < N_DIM; dim++) { - (&aabb.minX)[dim] = min_corner[dim]; - (&aabb.maxX)[dim] = max_corner[dim]; - } - return aabb; - }); - return std::move(aabbs); -} - -} // namespace detail - -void SpatialJoiner::Init(const Config* config) { - config_ = *dynamic_cast(config); - GPUSPATIAL_LOG_INFO("SpatialJoiner %p (Free %zu MB), Initialize, Concurrency %u", this, - rmm::available_device_memory().first / 1024 / 1024, - config_.concurrency); - details::RTConfig rt_config = details::get_default_rt_config(config_.ptx_root); - rt_engine_.Init(rt_config); - - loader_t::Config loader_config; - - thread_pool_ = std::make_shared(config_.parsing_threads); - build_loader_ = std::make_unique(thread_pool_); - build_loader_->Init(loader_config); - stream_pool_ = std::make_unique(config_.concurrency); - ctx_pool_ = ObjectPool::create(config_.concurrency); - CUDA_CHECK(cudaDeviceSetLimit(cudaLimitStackSize, config_.stack_size_bytes)); - Clear(); -} - -void SpatialJoiner::Clear() { - GPUSPATIAL_LOG_INFO("SpatialJoiner %p (Free %zu MB), Clear", this, - rmm::available_device_memory().first / 1024 / 1024); - bvh_buffer_ = nullptr; - 
geometry_grouper_.Clear(); - auto stream = rmm::cuda_stream_default; - build_loader_->Clear(stream); - build_geometries_.Clear(stream); - stream.synchronize(); -} - -void SpatialJoiner::PushBuild(const ArrowSchema* schema, const ArrowArray* array, - int64_t offset, int64_t length) { - GPUSPATIAL_LOG_INFO("SpatialJoiner %p (Free %zu MB), PushBuild, offset %ld, length %ld", - this, rmm::available_device_memory().first / 1024 / 1024, offset, - length); - build_loader_->Parse(rmm::cuda_stream_default, array, offset, length); -} - -void SpatialJoiner::FinishBuilding() { - auto stream = rmm::cuda_stream_default; - - build_geometries_ = std::move(build_loader_->Finish(stream)); - - GPUSPATIAL_LOG_INFO( - "SpatialJoiner %p (Free %zu MB), FinishBuilding, n_features: %ld, type %s", this, - rmm::available_device_memory().first / 1024 / 1024, - build_geometries_.num_features(), - GeometryTypeToString(build_geometries_.get_geometry_type())); - - if (build_geometries_.get_geometry_type() == GeometryType::kPoint) { - geometry_grouper_.Group(stream, build_geometries_, config_.n_points_per_aabb); - handle_ = buildBVH(stream, geometry_grouper_.get_aabbs(), bvh_buffer_); - } else { - auto aabbs = detail::ComputeAABBs(stream, build_geometries_.get_mbrs()); - handle_ = buildBVH(stream, ArrayView(aabbs), bvh_buffer_); - } - - relate_engine_ = RelateEngine(&build_geometries_, &rt_engine_); - RelateEngine::Config re_config; - - re_config.memory_quota = config_.relate_engine_memory_quota; - re_config.bvh_fast_build = config_.prefer_fast_build; - re_config.bvh_fast_compact = config_.compact; - - relate_engine_.set_config(re_config); -} - -void SpatialJoiner::PushStream(Context* base_ctx, const ArrowSchema* schema, - const ArrowArray* array, int64_t offset, int64_t length, - Predicate predicate, std::vector* build_indices, - std::vector* stream_indices, - int32_t array_index_offset) { - auto* ctx = (SpatialJoinerContext*)base_ctx; - ctx->cuda_stream = stream_pool_->get_stream(); - -#ifdef 
GPUSPATIAL_PROFILING - Stopwatch sw; - sw.start(); -#endif - ctx->array_index_offset = array_index_offset; - - if (ctx->stream_loader == nullptr) { - ctx->stream_loader = std::make_unique(thread_pool_); - loader_t::Config loader_config; - - ctx->stream_loader->Init(loader_config); - } - ctx->stream_loader->Parse(ctx->cuda_stream, array, offset, length); - ctx->stream_geometries = std::move(ctx->stream_loader->Finish(ctx->cuda_stream)); - - auto build_type = build_geometries_.get_geometry_type(); - auto stream_type = ctx->stream_geometries.get_geometry_type(); - - GPUSPATIAL_LOG_INFO( - "SpatialJoiner %p, PushStream, build features %zu, type %s, stream features %zu, type %s", - this, build_geometries_.num_features(), - GeometryTypeToString(build_geometries_.get_geometry_type()), - ctx->stream_geometries.num_features(), - GeometryTypeToString(ctx->stream_geometries.get_geometry_type())); - -#ifdef GPUSPATIAL_PROFILING - sw.stop(); - ctx->parse_ms += sw.ms(); -#endif - - if (build_type == GeometryType::kPoint) { - if (stream_type == GeometryType::kPoint) { - handleBuildPointStreamPoint(ctx, predicate, build_indices, stream_indices); - } else { - handleBuildPointStreamBox(ctx, predicate, build_indices, stream_indices); - } - } else { - if (stream_type == GeometryType::kPoint) { - handleBuildBoxStreamPoint(ctx, predicate, build_indices, stream_indices); - } else { - handleBuildBoxStreamBox(ctx, predicate, build_indices, stream_indices); - } - } -#ifdef GPUSPATIAL_PROFILING - printf("parse %lf, alloc %lf, filter %lf, refine %lf, copy_res %lf ms\n", ctx->parse_ms, - ctx->alloc_ms, ctx->filter_ms, ctx->refine_ms, ctx->copy_res_ms); -#endif -} - -void SpatialJoiner::handleBuildPointStreamPoint(SpatialJoinerContext* ctx, - Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices) { - allocateResultBuffer(ctx); - - ctx->shader_id = GetPointQueryShaderId(); - assert(ctx->stream_geometries.get_geometry_type() == GeometryType::kPoint); - - using 
launch_params_t = detail::LaunchParamsPointQuery; - ctx->launch_params_buffer = - std::make_unique(sizeof(launch_params_t), ctx->cuda_stream); - ctx->h_launch_params_buffer.resize(sizeof(launch_params_t)); - auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data(); - - launch_params.grouped = true; - launch_params.prefix_sum = geometry_grouper_.get_prefix_sum(); - launch_params.reordered_indices = geometry_grouper_.get_reordered_indices(); - launch_params.mbrs1 = ArrayView(); // no MBRs for point - launch_params.points2 = ctx->stream_geometries.get_points(); - launch_params.handle = handle_; - launch_params.ids = ctx->results.DeviceObject(); - CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params, - sizeof(launch_params_t), cudaMemcpyHostToDevice, - ctx->cuda_stream)); - - uint32_t dim_x = std::min(OPTIX_MAX_RAYS, ctx->stream_geometries.num_features()); - - filter(ctx, dim_x); - refine(ctx, predicate, build_indices, stream_indices); -} - -void SpatialJoiner::handleBuildBoxStreamPoint(SpatialJoinerContext* ctx, - Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices) { - allocateResultBuffer(ctx); - - ctx->shader_id = GetPointQueryShaderId(); - assert(ctx->stream_geometries.get_geometry_type() == GeometryType::kPoint); - - using launch_params_t = detail::LaunchParamsPointQuery; - ctx->launch_params_buffer = - std::make_unique(sizeof(launch_params_t), ctx->cuda_stream); - ctx->h_launch_params_buffer.resize(sizeof(launch_params_t)); - auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data(); - - launch_params.grouped = false; - launch_params.mbrs1 = build_geometries_.get_mbrs(); - launch_params.points2 = ctx->stream_geometries.get_points(); - launch_params.handle = handle_; - launch_params.ids = ctx->results.DeviceObject(); - CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params, - sizeof(launch_params_t), cudaMemcpyHostToDevice, - ctx->cuda_stream)); - - 
uint32_t dim_x = std::min(OPTIX_MAX_RAYS, ctx->stream_geometries.num_features()); - - filter(ctx, dim_x); - refine(ctx, predicate, build_indices, stream_indices); -} - -void SpatialJoiner::handleBuildPointStreamBox(SpatialJoinerContext* ctx, - Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices) { - allocateResultBuffer(ctx); - - ctx->shader_id = GetPointQueryShaderId(); - assert(build_geometries_.get_geometry_type() == GeometryType::kPoint); - - using launch_params_t = detail::LaunchParamsPointQuery; - ctx->launch_params_buffer = - std::make_unique(sizeof(launch_params_t), ctx->cuda_stream); - ctx->h_launch_params_buffer.resize(sizeof(launch_params_t)); - auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data(); - - auto aabbs = detail::ComputeAABBs(ctx->cuda_stream, ctx->stream_geometries.get_mbrs()); - auto handle = buildBVH(ctx->cuda_stream, ArrayView(aabbs), ctx->bvh_buffer); - - // mbrs1 are from stream; points2 are from build - launch_params.grouped = false; - launch_params.mbrs1 = ctx->stream_geometries.get_mbrs(); - launch_params.points2 = build_geometries_.get_points(); - launch_params.handle = handle; - launch_params.ids = ctx->results.DeviceObject(); - CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params, - sizeof(launch_params_t), cudaMemcpyHostToDevice, - ctx->cuda_stream)); - - uint32_t dim_x = std::min(OPTIX_MAX_RAYS, build_geometries_.num_features()); - // IMPORTANT: In this case, the BVH is built from stream geometries and points2 are - // build geometries, so the result pairs are (stream_id, build_id) instead of (build_id, - // stream_id). We need to swap the output buffers to correct this. 
- filter(ctx, dim_x, true); - refine(ctx, predicate, build_indices, stream_indices); -} - -void SpatialJoiner::handleBuildBoxStreamBox(SpatialJoinerContext* ctx, - Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices) { - allocateResultBuffer(ctx); - - // forward cast: cast rays from stream geometries with the BVH of build geometries - { - auto dim_x = std::min(OPTIX_MAX_RAYS, ctx->stream_geometries.num_features()); - - prepareLaunchParamsBoxQuery(ctx, true); - filter(ctx, dim_x); - refine(ctx, predicate, build_indices, stream_indices); - ctx->results.Clear(ctx->cuda_stream); // results have been copied, reuse space - } - // need allocate again as the previous results buffer has been shrinked to fit - allocateResultBuffer(ctx); - // backward cast: cast rays from the build geometries with the BVH of stream geometries - { - auto dim_x = std::min(OPTIX_MAX_RAYS, build_geometries_.num_features()); - auto v_mbrs = ctx->stream_geometries.get_mbrs(); - rmm::device_uvector aabbs(v_mbrs.size(), ctx->cuda_stream); - - thrust::transform(rmm::exec_policy_nosync(ctx->cuda_stream), v_mbrs.begin(), - v_mbrs.end(), aabbs.begin(), - [] __device__(const box_t& mbr) { return mbr.ToOptixAabb(); }); - - // Build a BVH over the MBRs of the stream geometries - ctx->handle = - buildBVH(ctx->cuda_stream, ArrayView(aabbs.data(), aabbs.size()), - ctx->bvh_buffer); - prepareLaunchParamsBoxQuery(ctx, false); - filter(ctx, dim_x); - refine(ctx, predicate, build_indices, stream_indices); - } -} - -OptixTraversableHandle SpatialJoiner::buildBVH( - const rmm::cuda_stream_view& stream, const ArrayView& aabbs, - std::unique_ptr& buffer) { - auto buffer_size_bytes = rt_engine_.EstimateMemoryUsageForAABB( - aabbs.size(), config_.prefer_fast_build, config_.compact); - - if (buffer == nullptr || buffer->size() < buffer_size_bytes) { - buffer = std::make_unique(buffer_size_bytes, stream); - } - - return rt_engine_.BuildAccelCustom(stream, aabbs, *buffer, 
config_.prefer_fast_build, - config_.compact); -} - -void SpatialJoiner::allocateResultBuffer(SpatialJoinerContext* ctx) { -#ifdef GPUSPATIAL_PROFILING - ctx->timer.start(ctx->cuda_stream); -#endif - int64_t avail_bytes = rmm::available_device_memory().first; - auto stream_type = ctx->stream_geometries.get_geometry_type(); - if (stream_type != GeometryType::kPoint) { - // need to reserve space for the BVH of stream - auto n_aabbs = ctx->stream_geometries.get_mbrs().size(); - - avail_bytes -= rt_engine_.EstimateMemoryUsageForAABB( - n_aabbs, config_.prefer_fast_build, config_.compact); - } - - if (avail_bytes <= 0) { - throw std::runtime_error( - "Not enough memory to allocate result space for spatial index"); - } - - uint64_t reserve_bytes = ceil(avail_bytes * config_.result_buffer_memory_reserve_ratio); - reserve_bytes = reserve_bytes / config_.concurrency + 1; - // two index_t for each result pair (build index, stream index) and another index_t for - // the temp storage - uint32_t n_items = reserve_bytes / (2 * sizeof(index_t) + sizeof(index_t)); - - GPUSPATIAL_LOG_INFO( - "SpatialJoiner %p, Allocate result buffer quota %zu MB, queue size %u", this, - reserve_bytes / 1024 / 1024, n_items); - - ctx->results.Init(ctx->cuda_stream, n_items); - ctx->results.Clear(ctx->cuda_stream); -#ifdef GPUSPATIAL_PROFILING - ctx->alloc_ms += ctx->timer.stop(ctx->cuda_stream); -#endif -} - -void SpatialJoiner::prepareLaunchParamsBoxQuery(SpatialJoinerContext* ctx, bool foward) { - using launch_params_t = detail::LaunchParamsBoxQuery; - ctx->launch_params_buffer = - std::make_unique(sizeof(launch_params_t), ctx->cuda_stream); - ctx->h_launch_params_buffer.resize(sizeof(launch_params_t)); - auto& launch_params = *(launch_params_t*)ctx->h_launch_params_buffer.data(); - - assert(ctx->stream_geometries.get_geometry_type() != GeometryType::kPoint); - - launch_params.mbrs1 = build_geometries_.get_mbrs(); - launch_params.mbrs2 = ctx->stream_geometries.get_mbrs(); - if (foward) { - 
launch_params.handle = handle_; - ctx->shader_id = GetBoxQueryForwardShaderId(); - } else { - launch_params.handle = ctx->handle; - ctx->shader_id = GetBoxQueryBackwardShaderId(); - } - - launch_params.ids = ctx->results.DeviceObject(); - CUDA_CHECK(cudaMemcpyAsync(ctx->launch_params_buffer->data(), &launch_params, - sizeof(launch_params_t), cudaMemcpyHostToDevice, - ctx->cuda_stream)); -} - -void SpatialJoiner::filter(SpatialJoinerContext* ctx, uint32_t dim_x, bool swap_id) { -#ifdef GPUSPATIAL_PROFILING - ctx->timer.start(ctx->cuda_stream); -#endif - Stopwatch sw; - sw.start(); - if (dim_x > 0) { - rt_engine_.Render(ctx->cuda_stream, ctx->shader_id, dim3{dim_x, 1, 1}, - ArrayView((char*)ctx->launch_params_buffer->data(), - ctx->launch_params_buffer->size())); - } - auto result_size = ctx->results.size(ctx->cuda_stream); - sw.stop(); - GPUSPATIAL_LOG_INFO( - "SpatialJoiner %p, Filter stage, Launched %u rays, Found %u candidates, time %lf ms", - this, dim_x, result_size, sw.ms()); - if (swap_id && result_size > 0) { - // swap the pair (build_id, stream_id) to (stream_id, build_id) - thrust::for_each(rmm::exec_policy_nosync(ctx->cuda_stream), ctx->results.data(), - ctx->results.data() + result_size, - [] __device__(thrust::pair & pair) { - thrust::swap(pair.first, pair.second); - }); - } - ctx->results.shrink_to_fit(ctx->cuda_stream); - -#ifdef GPUSPATIAL_PROFILING - ctx->filter_ms += ctx->timer.stop(ctx->cuda_stream); -#endif -} - -void SpatialJoiner::refine(SpatialJoinerContext* ctx, Predicate predicate, - std::vector* build_indices, - std::vector* stream_indices) { -#ifdef GPUSPATIAL_PROFILING - ctx->timer.start(ctx->cuda_stream); -#endif - relate_engine_.Evaluate(ctx->cuda_stream, ctx->stream_geometries, predicate, - ctx->results); -#ifdef GPUSPATIAL_PROFILING - ctx->refine_ms += ctx->timer.stop(ctx->cuda_stream); -#endif - auto n_results = ctx->results.size(ctx->cuda_stream); - -#ifdef GPUSPATIAL_PROFILING - ctx->timer.start(ctx->cuda_stream); -#endif - 
rmm::device_uvector tmp_result_buffer(n_results, ctx->cuda_stream); - - thrust::transform( - rmm::exec_policy_nosync(ctx->cuda_stream), ctx->results.data(), - ctx->results.data() + n_results, tmp_result_buffer.begin(), - [] __device__(const thrust::pair& pair) -> uint32_t { - return pair.first; - }); - auto prev_size = build_indices->size(); - build_indices->resize(build_indices->size() + n_results); - - CUDA_CHECK(cudaMemcpyAsync(build_indices->data() + prev_size, tmp_result_buffer.data(), - sizeof(uint32_t) * n_results, cudaMemcpyDeviceToHost, - ctx->cuda_stream)); - - auto array_index_offset = ctx->array_index_offset; - - thrust::transform( - rmm::exec_policy_nosync(ctx->cuda_stream), ctx->results.data(), - ctx->results.data() + n_results, tmp_result_buffer.begin(), - [=] __device__(const thrust::pair& pair) -> uint32_t { - return pair.second + array_index_offset; - }); - - stream_indices->resize(stream_indices->size() + n_results); - - CUDA_CHECK(cudaMemcpyAsync(stream_indices->data() + prev_size, tmp_result_buffer.data(), - sizeof(uint32_t) * n_results, cudaMemcpyDeviceToHost, - ctx->cuda_stream)); -#ifdef GPUSPATIAL_PROFILING - ctx->copy_res_ms += ctx->timer.stop(ctx->cuda_stream); -#endif - ctx->cuda_stream.synchronize(); -} - -std::unique_ptr CreateSpatialJoiner() { - return std::make_unique(); -} - -void InitSpatialJoiner(StreamingJoiner* index, const char* ptx_root, - uint32_t concurrency) { - SpatialJoiner::SpatialJoinerConfig config; - config.ptx_root = ptx_root; - config.concurrency = concurrency; - index->Init(&config); -} - -} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt b/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt index 719d0909f..bcf69239f 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt +++ b/c/sedona-libgpuspatial/libgpuspatial/test/CMakeLists.txt @@ -53,8 +53,19 @@ if(GPUSPATIAL_BUILD_TESTS) PRIVATE $<$:--expt-extended-lambda --expt-relaxed-constexpr>) - 
add_executable(joiner_test main.cc array_stream.cc joiner_test.cu) - target_link_libraries(joiner_test + add_executable(index_test main.cc index_test.cu) + target_link_libraries(index_test + PRIVATE cuda + GTest::gtest_main + GTest::gmock_main + gpuspatial + GEOS::geos + GEOS::geos_c) + target_compile_options(index_test + PRIVATE $<$:--expt-extended-lambda + --expt-relaxed-constexpr>) + add_executable(refiner_test main.cc array_stream.cc refiner_test.cu) + target_link_libraries(refiner_test PRIVATE cuda GTest::gtest_main GTest::gmock_main @@ -65,7 +76,7 @@ if(GPUSPATIAL_BUILD_TESTS) Arrow::arrow_static Parquet::parquet_static nanoarrow::nanoarrow_ipc) - target_compile_options(joiner_test + target_compile_options(refiner_test PRIVATE $<$:--expt-extended-lambda --expt-relaxed-constexpr>) @@ -83,14 +94,19 @@ if(GPUSPATIAL_BUILD_TESTS) --expt-relaxed-constexpr>) add_executable(c_wrapper_test main.cc c_wrapper_test.cc array_stream.cc) - target_link_libraries(c_wrapper_test PRIVATE GTest::gtest_main GTest::gmock_main - gpuspatial_c nanoarrow::nanoarrow_ipc) + target_link_libraries(c_wrapper_test + PRIVATE GTest::gtest_main + GTest::gmock_main + gpuspatial_c + GEOS::geos + GEOS::geos_c + geoarrow_geos + nanoarrow::nanoarrow_ipc) include(GoogleTest) gtest_discover_tests(gpuspatial_testing_test) gtest_discover_tests(array_stream_test) gtest_discover_tests(loader_test) - gtest_discover_tests(joiner_test) gtest_discover_tests(relate_test) endif() diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc b/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc index 60c247399..269c03898 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc +++ b/c/sedona-libgpuspatial/libgpuspatial/test/c_wrapper_test.cc @@ -24,40 +24,133 @@ #include #include #include "array_stream.hpp" +#include "geoarrow_geos/geoarrow_geos.hpp" #include "nanoarrow/nanoarrow.hpp" -namespace TestUtils { -std::string GetTestDataPath(const std::string& 
relative_path_to_file); +TEST(RuntimeTest, InitializeRuntime) { + GpuSpatialRuntime runtime; + GpuSpatialRuntimeCreate(&runtime); + GpuSpatialRuntimeConfig config; + + std::string ptx_root = TestUtils::GetTestShaderPath(); + config.ptx_root = ptx_root.c_str(); + config.device_id = 0; + config.cuda_init_memory_pool_ratio = 0; + ASSERT_EQ(runtime.init(&runtime, &config), 0); + + runtime.release(&runtime); +} + +TEST(RuntimeTest, ErrorTest) { + GpuSpatialRuntime runtime; + GpuSpatialRuntimeCreate(&runtime); + GpuSpatialRuntimeConfig runtime_config; + + runtime_config.ptx_root = "/invalid/path/to/ptx"; + runtime_config.device_id = 0; + runtime_config.cuda_init_memory_pool_ratio = 0; + + EXPECT_NE(runtime.init(&runtime, &runtime_config), 0); + + const char* raw_error = runtime.get_last_error(&runtime); + printf("Error received: %s\n", raw_error); + + std::string error_msg(raw_error); + + EXPECT_NE(error_msg.find("No such file or directory"), std::string::npos) + << "Error message was corrupted or incorrect. 
Got: " << error_msg; + + runtime.release(&runtime); +} + +TEST(SpatialIndexTest, InitializeIndex) { + GpuSpatialRuntime runtime; + GpuSpatialRuntimeCreate(&runtime); + GpuSpatialRuntimeConfig runtime_config; + + std::string ptx_root = TestUtils::GetTestShaderPath(); + runtime_config.ptx_root = ptx_root.c_str(); + runtime_config.device_id = 0; + runtime_config.cuda_init_memory_pool_ratio = 0.1; + ASSERT_EQ(runtime.init(&runtime, &runtime_config), 0); + + SedonaFloatIndex2D index; + GpuSpatialIndexConfig index_config; + + index_config.runtime = &runtime; + index_config.concurrency = 1; + + ASSERT_EQ(GpuSpatialIndexFloat2DCreate(&index, &index_config), 0); + + index.release(&index); + runtime.release(&runtime); +} + +TEST(RefinerTest, InitializeRefiner) { + GpuSpatialRuntime runtime; + GpuSpatialRuntimeCreate(&runtime); + GpuSpatialRuntimeConfig runtime_config; + + std::string ptx_root = TestUtils::GetTestShaderPath(); + runtime_config.ptx_root = ptx_root.c_str(); + runtime_config.device_id = 0; + runtime_config.cuda_init_memory_pool_ratio = 0.1; + ASSERT_EQ(runtime.init(&runtime, &runtime_config), 0); + + SedonaSpatialRefiner refiner; + GpuSpatialRefinerConfig refiner_config; + + refiner_config.runtime = &runtime; + refiner_config.concurrency = 1; + + ASSERT_EQ(GpuSpatialRefinerCreate(&refiner, &refiner_config), 0); + + refiner.release(&refiner); + runtime.release(&runtime); } class CWrapperTest : public ::testing::Test { protected: void SetUp() override { - // Initialize the GpuSpatialJoiner - GpuSpatialJoinerCreate(&joiner_); - struct GpuSpatialJoinerConfig config_; - std::string ptx_root = TestUtils::GetTestDataPath("shaders_ptx"); + std::string ptx_root = TestUtils::GetTestShaderPath(); + + GpuSpatialRuntimeCreate(&runtime_); + GpuSpatialRuntimeConfig runtime_config; + + runtime_config.ptx_root = ptx_root.c_str(); + runtime_config.device_id = 0; + runtime_config.cuda_init_memory_pool_ratio = 0.1; + ASSERT_EQ(runtime_.init(&runtime_, &runtime_config), 0); + + 
GpuSpatialIndexConfig index_config; + + index_config.runtime = &runtime_; + index_config.concurrency = 1; + + ASSERT_EQ(GpuSpatialIndexFloat2DCreate(&index_, &index_config), 0); - // Set up the configuration - config_.concurrency = 2; // Example concurrency level - config_.ptx_root = ptx_root.c_str(); + GpuSpatialRefinerConfig refiner_config; - ASSERT_EQ(joiner_.init(&joiner_, &config_), 0); - // Initialize the context + refiner_config.runtime = &runtime_; + refiner_config.concurrency = 1; + + ASSERT_EQ(GpuSpatialRefinerCreate(&refiner_, &refiner_config), 0); } void TearDown() override { - // Clean up - joiner_.release(&joiner_); + refiner_.release(&refiner_); + index_.release(&index_); + runtime_.release(&runtime_); } - - struct GpuSpatialJoiner joiner_; + GpuSpatialRuntime runtime_; + SedonaFloatIndex2D index_; + SedonaSpatialRefiner refiner_; }; TEST_F(CWrapperTest, InitializeJoiner) { + using fpoint_t = gpuspatial::Point; + using box_t = gpuspatial::Box; // Test if the joiner initializes correctly - struct GpuSpatialJoinerContext context_; - joiner_.create_context(&joiner_, &context_); auto poly_path = TestUtils::GetTestDataPath("arrowipc/test_polygons.arrows"); auto point_path = TestUtils::GetTestDataPath("arrowipc/test_points.arrows"); @@ -73,6 +166,8 @@ TEST_F(CWrapperTest, InitializeJoiner) { int n_row_groups = 100; + geoarrow::geos::ArrayReader reader; + for (int i = 0; i < n_row_groups; i++) { ASSERT_EQ(ArrowArrayStreamGetNext(poly_stream.get(), build_array.get(), &error), NANOARROW_OK); @@ -84,23 +179,138 @@ TEST_F(CWrapperTest, InitializeJoiner) { ASSERT_EQ(ArrowArrayStreamGetSchema(point_stream.get(), stream_schema.get(), &error), NANOARROW_OK); - joiner_.push_build(&joiner_, build_schema.get(), build_array.get(), 0, - build_array->length); - joiner_.finish_building(&joiner_); + class GEOSCppHandle { + public: + GEOSContextHandle_t handle; + + GEOSCppHandle() { handle = GEOS_init_r(); } + + ~GEOSCppHandle() { GEOS_finish_r(handle); } + }; + 
GEOSCppHandle handle; + + reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB); + + geoarrow::geos::GeometryVector geom_build(handle.handle); + + geom_build.resize(build_array->length); + size_t n_build; + + ASSERT_EQ(reader.Read(build_array.get(), 0, build_array->length, + geom_build.mutable_data(), &n_build), + GEOARROW_GEOS_OK); + auto* tree = GEOSSTRtree_create_r(handle.handle, 10); + std::vector rects; + + for (size_t build_idx = 0; build_idx < build_array->length; build_idx++) { + auto* geom = geom_build.borrow(build_idx); + auto* box = GEOSEnvelope_r(handle.handle, geom); + + double xmin, ymin, xmax, ymax; + int result = GEOSGeom_getExtent_r(handle.handle, box, &xmin, &ymin, &xmax, &ymax); + ASSERT_EQ(result, 1); + box_t bbox(fpoint_t((float)xmin, (float)ymin), fpoint_t((float)xmax, (float)ymax)); + + rects.push_back(bbox); + + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)build_idx); + GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom); + GEOSGeom_destroy_r(handle.handle, box); + } + + index_.clear(&index_); + ASSERT_EQ(index_.push_build(&index_, (float*)rects.data(), rects.size()), 0); + ASSERT_EQ(index_.finish_building(&index_), 0); - joiner_.push_stream(&joiner_, &context_, stream_schema.get(), stream_array.get(), 0, - stream_array->length, GpuSpatialPredicateContains, 0); + geoarrow::geos::GeometryVector geom_stream(handle.handle); + size_t n_stream; + geom_stream.resize(stream_array->length); - void* build_indices_ptr; - void* stream_indices_ptr; + ASSERT_EQ(reader.Read(stream_array.get(), 0, stream_array->length, + geom_stream.mutable_data(), &n_stream), + GEOARROW_GEOS_OK); + + std::vector queries; + + for (size_t stream_idx = 0; stream_idx < stream_array->length; stream_idx++) { + auto* geom = geom_stream.borrow(stream_idx); + double xmin, ymin, xmax, ymax; + int result = GEOSGeom_getExtent_r(handle.handle, geom, &xmin, &ymin, &xmax, &ymax); + ASSERT_EQ(result, 1); + box_t bbox(fpoint_t((float)xmin, 
(float)ymin), fpoint_t((float)xmax, (float)ymax)); + queries.push_back(bbox); + } + + SedonaSpatialIndexContext idx_ctx; + index_.create_context(&idx_ctx); + + index_.probe(&index_, &idx_ctx, (float*)queries.data(), queries.size()); + + uint32_t* build_indices_ptr; + uint32_t* probe_indices_ptr; uint32_t build_indices_length; - uint32_t stream_indices_length; + uint32_t probe_indices_length; - joiner_.get_build_indices_buffer(&context_, (void**)&build_indices_ptr, - &build_indices_length); - joiner_.get_stream_indices_buffer(&context_, (void**)&stream_indices_ptr, - &stream_indices_length); - } + index_.get_build_indices_buffer(&idx_ctx, &build_indices_ptr, &build_indices_length); + index_.get_probe_indices_buffer(&idx_ctx, &probe_indices_ptr, &probe_indices_length); + + uint32_t new_len; + ASSERT_EQ( + refiner_.refine(&refiner_, build_schema.get(), build_array.get(), + stream_schema.get(), stream_array.get(), + SedonaSpatialRelationPredicate::SedonaSpatialPredicateContains, + (uint32_t*)build_indices_ptr, (uint32_t*)probe_indices_ptr, + build_indices_length, &new_len), + 0); - joiner_.destroy_context(&context_); + std::vector build_indices((uint32_t*)build_indices_ptr, + (uint32_t*)build_indices_ptr + new_len); + std::vector probe_indices((uint32_t*)probe_indices_ptr, + (uint32_t*)probe_indices_ptr + new_len); + + struct Payload { + GEOSContextHandle_t handle; + const GEOSGeometry* geom; + std::vector build_indices; + std::vector stream_indices; + SedonaSpatialRelationPredicate predicate; + }; + + Payload payload; + payload.predicate = SedonaSpatialRelationPredicate::SedonaSpatialPredicateContains; + payload.handle = handle.handle; + + for (size_t offset = 0; offset < n_stream; offset++) { + auto* geom = geom_stream.borrow(offset); + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); + payload.geom = geom; + + GEOSSTRtree_query_r( + handle.handle, tree, geom, + [](void* item, void* data) { + auto* geom_build = (GEOSGeometry*)item; + auto* 
payload = (Payload*)data; + auto* geom_stream = payload->geom; + + if (GEOSContains_r(payload->handle, geom_build, geom_stream) == 1) { + auto build_id = (size_t)GEOSGeom_getUserData_r(payload->handle, geom_build); + auto stream_id = + (size_t)GEOSGeom_getUserData_r(payload->handle, geom_stream); + payload->build_indices.push_back(build_id); + payload->stream_indices.push_back(stream_id); + } + }, + (void*)&payload); + } + + ASSERT_EQ(payload.build_indices.size(), build_indices.size()); + ASSERT_EQ(payload.stream_indices.size(), probe_indices.size()); + TestUtils::sort_vectors_by_index(payload.build_indices, payload.stream_indices); + TestUtils::sort_vectors_by_index(build_indices, probe_indices); + for (size_t j = 0; j < build_indices.size(); j++) { + ASSERT_EQ(payload.build_indices[j], build_indices[j]); + ASSERT_EQ(payload.stream_indices[j], probe_indices[j]); + } + index_.destroy_context(&idx_ctx); + } } diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile index 5b04c384b..ac2eb06d8 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile +++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/Makefile @@ -19,7 +19,7 @@ URL := https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/natural-e INPUT_FILE := natural-earth_cities_geo.parquet PYTHON_SCRIPT := ../gen_points.py OUTPUT_POINTS := generated_points.parquet -NUM_POINTS := 1000 +NUM_POINTS := 10000 .PHONY: all clean generate diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet index 4ad348b3a..024547360 100644 Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet and b/c/sedona-libgpuspatial/libgpuspatial/test/data/cities/generated_points.parquet differ diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile 
b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile index 147a332bd..f154c4416 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile +++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/Makefile @@ -19,7 +19,7 @@ URL := https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/natural-e INPUT_FILE := natural-earth_countries_geo.parquet PYTHON_SCRIPT := ../gen_points.py OUTPUT_POINTS := generated_points.parquet -NUM_POINTS := 1000 +NUM_POINTS := 10000 .PHONY: all clean generate diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet index 32d8dcc27..70af40443 100644 Binary files a/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet and b/c/sedona-libgpuspatial/libgpuspatial/test/data/countries/generated_points.parquet differ diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py b/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py index a02f4a094..b23a89ebc 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py +++ b/c/sedona-libgpuspatial/libgpuspatial/test/data/gen_points.py @@ -47,7 +47,7 @@ def calculate_bbox_and_generate_points(geoparquet_path, n_points, output_path): # Generate random coordinates random_x = np.random.uniform(minx, maxx, n_points) - random_y = np.random.uniform(miny, miny, n_points) + random_y = np.random.uniform(miny, maxy, n_points) # 4. Create a GeoDataFrame from the points diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/index_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/index_test.cu new file mode 100644 index 000000000..21d407233 --- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/test/index_test.cu @@ -0,0 +1,299 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "array_stream.hpp" +#include "gpuspatial/index/rt_spatial_index.cuh" +#include "test_common.hpp" + +#include +#include +#include + +#include +#include +#include // For std::iota +#include +#include + +namespace gpuspatial { +template +struct SpatialIndexTest : public ::testing::Test { + using index_t = RTSpatialIndex; + std::shared_ptr rt_engine; + index_t index; + + SpatialIndexTest() { + auto ptx_root = TestUtils::GetTestShaderPath(); + rt_engine = std::make_shared(); + rt_engine->Init(get_default_rt_config(ptx_root)); + RTSpatialIndexConfig config; + config.rt_engine = rt_engine; + index = std::move(index_t(config)); + } +}; +using PointTypes = ::testing::Types, Point>; +TYPED_TEST_SUITE(SpatialIndexTest, PointTypes); + +template +std::vector> GeneratePoints(size_t n, std::mt19937& rng) { + using scalar_t = typename POINT_T::scalar_t; + std::vector> rects(n); + + for (size_t i = 0; i < n; i++) { + POINT_T p; + for (int dim = 0; dim < POINT_T::n_dim; dim++) { + std::uniform_real_distribution dist(-180.0, 180.0); + p.set_coordinate(dim, dist(rng)); + } + rects[i] = Box(p, p); + } + return rects; +} + +template +std::vector> GenerateRects(size_t n, std::mt19937& rng) { + using scalar_t = typename POINT_T::scalar_t; + std::vector> rects(n); + 
std::uniform_real_distribution distSize(0.0, 100); + + for (size_t i = 0; i < n; ++i) { + POINT_T min_pt, max_pt, size_pt; + + for (int dim = 0; dim < POINT_T::n_dim; dim++) { + std::uniform_real_distribution dist(-180.0, 180.0); + min_pt.set_coordinate(dim, dist(rng)); + size_pt.set_coordinate(dim, distSize(rng)); + } + max_pt = min_pt + size_pt; + rects[i] = Box(min_pt, max_pt); + } + return rects; +} + +template +void ComputeReference(const std::vector>& build, + const std::vector>& probe, + std::vector& build_indices, + std::vector& probe_indices) { + geos::index::strtree::STRtree tree; + + // FIX: Create a storage container for envelopes that persists + // for the lifetime of the tree usage. + std::vector build_envelopes; + build_envelopes.reserve(build.size()); + + // 2. Build Phase + for (uint32_t j = 0; j < build.size(); j++) { + auto min_corner = build[j].get_min(); + auto max_corner = build[j].get_max(); + + // Emplace the envelope into our persistent vector + build_envelopes.emplace_back(min_corner.x(), max_corner.x(), min_corner.y(), + max_corner.y()); + + // Pass the address of the element inside the vector + // Note: We reserved memory above, so pointers shouldn't be invalidated by resizing + tree.insert(&build_envelopes.back(), + reinterpret_cast(static_cast(j))); + } + + tree.build(); + + // 3. 
Define Visitor (No changes needed here) + class InteractionVisitor : public geos::index::ItemVisitor { + public: + const std::vector>* build; + const std::vector>* probe; + std::vector* b_indices; + std::vector* p_indices; + uint32_t current_probe_idx; + + void visitItem(void* item) override { + uintptr_t build_idx_ptr = reinterpret_cast(item); + uint32_t build_idx = static_cast(build_idx_ptr); + + // Refinement step + if ((*build)[build_idx].intersects((*probe)[current_probe_idx])) { + b_indices->push_back(build_idx); + p_indices->push_back(current_probe_idx); + } + } + }; + + InteractionVisitor visitor; + visitor.build = &build; + visitor.probe = &probe; + visitor.b_indices = &build_indices; + visitor.p_indices = &probe_indices; + + // 4. Probe Phase + for (uint32_t i = 0; i < probe.size(); i++) { + auto min_corner = probe[i].get_min(); + auto max_corner = probe[i].get_max(); + + // It is safe to create this on the stack here because `query` + // finishes executing before `search_env` goes out of scope. + geos::geom::Envelope search_env(min_corner.x(), max_corner.x(), min_corner.y(), + max_corner.y()); + + visitor.current_probe_idx = i; + tree.query(&search_env, visitor); + } +} + +template +void sort_vectors(std::vector& v1, std::vector& v2) { + if (v1.size() != v2.size()) return; + + // 1. Create indices [0, 1, 2, ..., N-1] + std::vector p(v1.size()); + std::iota(p.begin(), p.end(), 0); + + // 2. Sort indices based on comparing values in v1 and v2 + std::sort(p.begin(), p.end(), [&](size_t i, size_t j) { + if (v1[i] != v1[j]) return v1[i] < v1[j]; // Primary sort by v1 + return v2[i] < v2[j]; // Secondary sort by v2 + }); + + // 3. Apply permutation (Reorder v1 and v2 based on sorted indices) + // Note: Doing this in-place with O(1) space is complex; + // using auxiliary O(N) space is standard. 
+ std::vector sorted_v1, sorted_v2; + sorted_v1.reserve(v1.size()); + sorted_v2.reserve(v2.size()); + + for (size_t i : p) { + sorted_v1.push_back(v1[i]); + sorted_v2.push_back(v2[i]); + } + + v1 = std::move(sorted_v1); + v2 = std::move(sorted_v2); +} + +TYPED_TEST(SpatialIndexTest, PointPoint) { + using point_t = TypeParam; + std::mt19937 gen(0); + + for (int i = 1; i <= 10000; i *= 2) { + auto points1 = GeneratePoints(i, gen); + this->index.Clear(); + this->index.PushBuild(points1.data(), points1.size()); + this->index.FinishBuilding(); + + for (int j = 1; j <= 10000; j *= 2) { + auto points2 = GeneratePoints(j, gen); + + size_t count = static_cast(points1.size() * 0.2); + + // 2. Define the starting point (the last 'count' elements) + auto start_it = points1.end() - count; + + // 3. Append to the second vector + points2.insert(points2.end(), start_it, points1.end()); + + std::vector build_indices, probe_indices; + this->index.Probe(points2.data(), points2.size(), &build_indices, &probe_indices); + sort_vectors(build_indices, probe_indices); + + std::vector ref_build_indices, ref_probe_indices; + ComputeReference(points1, points2, ref_build_indices, ref_probe_indices); + sort_vectors(ref_build_indices, ref_probe_indices); + + ASSERT_EQ(build_indices, ref_build_indices); + ASSERT_EQ(probe_indices, ref_probe_indices); + } + } +} + +TYPED_TEST(SpatialIndexTest, BoxPoint) { + using point_t = TypeParam; + std::mt19937 gen(0); + + for (int i = 1; i <= 10000; i *= 2) { + auto rects1 = GenerateRects(i, gen); + this->index.Clear(); + this->index.PushBuild(rects1.data(), rects1.size()); + this->index.FinishBuilding(); + + for (int j = 1; j <= 10000; j *= 2) { + auto points2 = GeneratePoints(j, gen); + std::vector build_indices, probe_indices; + this->index.Probe(points2.data(), points2.size(), &build_indices, &probe_indices); + sort_vectors(build_indices, probe_indices); + + std::vector ref_build_indices, ref_probe_indices; + ComputeReference(rects1, points2, 
ref_build_indices, ref_probe_indices); + sort_vectors(ref_build_indices, ref_probe_indices); + + ASSERT_EQ(build_indices, ref_build_indices); + ASSERT_EQ(probe_indices, ref_probe_indices); + } + } +} + +TYPED_TEST(SpatialIndexTest, PointBox) { + using point_t = TypeParam; + std::mt19937 gen(0); + + for (int i = 1; i <= 10000; i *= 2) { + auto points1 = GeneratePoints(i, gen); + this->index.Clear(); + this->index.PushBuild(points1.data(), points1.size()); + this->index.FinishBuilding(); + + for (int j = 1; j <= 10000; j *= 2) { + auto rects2 = GenerateRects(j, gen); + std::vector build_indices, probe_indices; + this->index.Probe(rects2.data(), rects2.size(), &build_indices, &probe_indices); + sort_vectors(build_indices, probe_indices); + + std::vector ref_build_indices, ref_probe_indices; + ComputeReference(points1, rects2, ref_build_indices, ref_probe_indices); + sort_vectors(ref_build_indices, ref_probe_indices); + + ASSERT_EQ(build_indices, ref_build_indices); + ASSERT_EQ(probe_indices, ref_probe_indices); + } + } +} + +TYPED_TEST(SpatialIndexTest, BoxBox) { + using point_t = TypeParam; + std::mt19937 gen(0); + + for (int i = 1; i <= 10000; i *= 2) { + auto rects1 = GenerateRects(i, gen); + this->index.Clear(); + this->index.PushBuild(rects1.data(), rects1.size()); + this->index.FinishBuilding(); + + for (int j = 1; j <= 10000; j *= 2) { + auto rects2 = GenerateRects(j, gen); + std::vector build_indices, probe_indices; + this->index.Probe(rects2.data(), rects2.size(), &build_indices, &probe_indices); + sort_vectors(build_indices, probe_indices); + + std::vector ref_build_indices, ref_probe_indices; + ComputeReference(rects1, rects2, ref_build_indices, ref_probe_indices); + sort_vectors(ref_build_indices, ref_probe_indices); + + ASSERT_EQ(build_indices, ref_build_indices); + ASSERT_EQ(probe_indices, ref_probe_indices); + } + } +} +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/joiner_test.cu 
b/c/sedona-libgpuspatial/libgpuspatial/test/joiner_test.cu deleted file mode 100644 index bbf415592..000000000 --- a/c/sedona-libgpuspatial/libgpuspatial/test/joiner_test.cu +++ /dev/null @@ -1,438 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#include "array_stream.hpp" -#include "gpuspatial/index/spatial_joiner.cuh" -#include "gpuspatial/loader/device_geometries.cuh" -#include "test_common.hpp" - -#include "geoarrow_geos/geoarrow_geos.hpp" -#include "nanoarrow/nanoarrow.hpp" - -#include -#include -#include -#include // For std::iota - -namespace gpuspatial { -// Function to read a single Parquet file and extract a column. -static arrow::Status ReadParquetFromFile( - arrow::fs::FileSystem* fs, // 1. Filesystem pointer (e.g., LocalFileSystem) - const std::string& file_path, // 2. Single file path instead of a folder - int64_t batch_size, const char* column_name, - std::vector>& out_arrays) { - // 1. 
Get FileInfo for the single path - ARROW_ASSIGN_OR_RAISE(auto file_info, fs->GetFileInfo(file_path)); - - // Check if the path points to a file - if (file_info.type() != arrow::fs::FileType::File) { - return arrow::Status::Invalid("Path is not a file: ", file_path); - } - - std::cout << "--- Processing Parquet file: " << file_path << " ---" << std::endl; - - // 2. Open the input file - ARROW_ASSIGN_OR_RAISE(auto input_file, fs->OpenInputFile(file_info)); - - // 3. Open the Parquet file and create an Arrow reader - ARROW_ASSIGN_OR_RAISE(auto arrow_reader, parquet::arrow::OpenFile( - input_file, arrow::default_memory_pool())); - - // 4. Set the batch size - arrow_reader->set_batch_size(batch_size); - - // 5. Get the RecordBatchReader - auto rb_reader = arrow_reader->GetRecordBatchReader().ValueOrDie(); - // 6. Read all record batches and extract the column - while (true) { - std::shared_ptr batch; - - // Read the next batch - ARROW_THROW_NOT_OK(rb_reader->ReadNext(&batch)); - - // Check for end of stream - if (!batch) { - break; - } - - // Extract the specified column and add to the output vector - std::shared_ptr column_array = batch->GetColumnByName(column_name); - if (!column_array) { - return arrow::Status::Invalid("Column not found: ", column_name); - } - out_arrays.push_back(column_array); - } - - std::cout << "Finished reading. 
Total arrays extracted: " << out_arrays.size() - << std::endl; - return arrow::Status::OK(); -} - -using GeosBinaryPredicateFn = char (*)(GEOSContextHandle_t, const GEOSGeometry*, - const GEOSGeometry*); -static GeosBinaryPredicateFn GetGeosPredicateFn(Predicate predicate) { - switch (predicate) { - case Predicate::kContains: - return &GEOSContains_r; - case Predicate::kIntersects: - return &GEOSIntersects_r; - case Predicate::kWithin: - return &GEOSWithin_r; - case Predicate::kEquals: - return &GEOSEquals_r; - case Predicate::kTouches: - return &GEOSTouches_r; - default: - throw std::out_of_range("Unsupported GEOS predicate enumeration value."); - } -} - -void TestJoiner(const std::string& build_parquet_path, - const std::string& stream_parquet_path, Predicate predicate, - int batch_size = 10) { - using namespace TestUtils; - auto fs = std::make_shared(); - SpatialJoiner::SpatialJoinerConfig config; - std::string ptx_root = TestUtils::GetTestShaderPath(); - - config.ptx_root = ptx_root.c_str(); - SpatialJoiner spatial_joiner; - - spatial_joiner.Init(&config); - spatial_joiner.Clear(); - - geoarrow::geos::ArrayReader reader; - - class GEOSCppHandle { - public: - GEOSContextHandle_t handle; - - GEOSCppHandle() { handle = GEOS_init_r(); } - - ~GEOSCppHandle() { GEOS_finish_r(handle); } - }; - GEOSCppHandle handle; - - reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB); - - geoarrow::geos::GeometryVector geom_build(handle.handle); - - auto get_total_length = [](const std::vector>& arrays) { - size_t total_length = 0; - for (const auto& array : arrays) { - total_length += array->length(); - } - return total_length; - }; - - std::vector> build_arrays; - ARROW_THROW_NOT_OK(ReadParquetFromFile(fs.get(), build_parquet_path, batch_size, - "geometry", build_arrays)); - - // Using GEOS for reference - geom_build.resize(get_total_length(build_arrays)); - size_t tail_build = 0; - auto* tree = GEOSSTRtree_create_r(handle.handle, 10); - - for (auto& array : 
build_arrays) { - nanoarrow::UniqueArray unique_array; - nanoarrow::UniqueSchema unique_schema; - - ARROW_THROW_NOT_OK( - arrow::ExportArray(*array, unique_array.get(), unique_schema.get())); - - spatial_joiner.PushBuild(unique_schema.get(), unique_array.get(), 0, - unique_array->length); - - // geos for reference - size_t n_build; - - ASSERT_EQ(reader.Read(unique_array.get(), 0, unique_array->length, - geom_build.mutable_data() + tail_build, &n_build), - GEOARROW_GEOS_OK); - - for (size_t offset = tail_build; offset < tail_build + n_build; offset++) { - auto* geom = geom_build.borrow(offset); - auto* box = GEOSEnvelope_r(handle.handle, geom); - GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); - GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom); - GEOSGeom_destroy_r(handle.handle, box); - } - tail_build += n_build; - } - spatial_joiner.FinishBuilding(); - ASSERT_EQ(GEOSSTRtree_build_r(handle.handle, tree), 1); - - std::vector> stream_arrays; - ARROW_THROW_NOT_OK(ReadParquetFromFile( - fs.get(), stream_parquet_path, batch_size, "geometry", stream_arrays)); - int array_index_offset = 0; - auto context = spatial_joiner.CreateContext(); - - for (auto& array : stream_arrays) { - nanoarrow::UniqueArray unique_array; - nanoarrow::UniqueSchema unique_schema; - - ARROW_THROW_NOT_OK( - arrow::ExportArray(*array, unique_array.get(), unique_schema.get())); - std::vector build_indices, stream_indices; - - spatial_joiner.PushStream(context.get(), unique_schema.get(), unique_array.get(), 0, - unique_array->length, predicate, &build_indices, - &stream_indices, array_index_offset); - - geoarrow::geos::GeometryVector geom_stream(handle.handle); - size_t n_stream; - geom_stream.resize(array->length()); - ASSERT_EQ(reader.Read(unique_array.get(), 0, unique_array->length, - geom_stream.mutable_data(), &n_stream), - GEOARROW_GEOS_OK); - struct Payload { - GEOSContextHandle_t handle; - const GEOSGeometry* geom; - int64_t stream_index_offset; - 
std::vector build_indices; - std::vector stream_indices; - Predicate predicate; - }; - - Payload payload; - payload.predicate = predicate; - payload.handle = handle.handle; - - payload.stream_index_offset = array_index_offset; - - for (size_t offset = 0; offset < n_stream; offset++) { - auto* geom = geom_stream.borrow(offset); - GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); - payload.geom = geom; - - GEOSSTRtree_query_r( - handle.handle, tree, geom, - [](void* item, void* data) { - auto* geom_build = (GEOSGeometry*)item; - auto* payload = (Payload*)data; - auto* geom_stream = payload->geom; - - if (GetGeosPredicateFn(payload->predicate)(payload->handle, geom_build, - geom_stream) == 1) { - auto build_id = (size_t)GEOSGeom_getUserData_r(payload->handle, geom_build); - auto stream_id = - (size_t)GEOSGeom_getUserData_r(payload->handle, geom_stream); - payload->build_indices.push_back(build_id); - payload->stream_indices.push_back(payload->stream_index_offset + stream_id); - } - }, - (void*)&payload); - } - - ASSERT_EQ(payload.build_indices.size(), build_indices.size()); - ASSERT_EQ(payload.stream_indices.size(), stream_indices.size()); - sort_vectors_by_index(payload.build_indices, payload.stream_indices); - sort_vectors_by_index(build_indices, stream_indices); - for (size_t j = 0; j < build_indices.size(); j++) { - ASSERT_EQ(payload.build_indices[j], build_indices[j]); - ASSERT_EQ(payload.stream_indices[j], stream_indices[j]); - } - array_index_offset += array->length(); - } - GEOSSTRtree_destroy_r(handle.handle, tree); -} - -TEST(JoinerTest, PIPContainsParquet) { - using namespace TestUtils; - auto fs = std::make_shared(); - - std::vector polys{ - GetTestDataPath("cities/natural-earth_cities_geo.parquet"), - GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; - std::vector points{GetTestDataPath("cities/generated_points.parquet"), - GetTestDataPath("countries/generated_points.parquet")}; - - for (int i = 0; i < 
polys.size(); i++) { - auto poly_path = TestUtils::GetTestDataPath(polys[i]); - auto point_path = TestUtils::GetCanonicalPath(points[i]); - TestJoiner(poly_path, point_path, Predicate::kContains, 10); - } -} - -TEST(JoinerTest, PIPWithinParquet) { - using namespace TestUtils; - auto fs = std::make_shared(); - - std::vector polys{ - GetTestDataPath("cities/natural-earth_cities_geo.parquet"), - GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; - std::vector points{GetTestDataPath("cities/generated_points.parquet"), - GetTestDataPath("countries/generated_points.parquet")}; - - for (int i = 0; i < polys.size(); i++) { - auto poly_path = TestUtils::GetTestDataPath(polys[i]); - auto point_path = TestUtils::GetCanonicalPath(points[i]); - TestJoiner(point_path, poly_path, Predicate::kWithin, 10); - } -} - -TEST(JoinerTest, PolyPointIntersectsParquet) { - using namespace TestUtils; - auto fs = std::make_shared(); - - std::vector polys{ - GetTestDataPath("cities/natural-earth_cities_geo.parquet"), - GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; - std::vector points{GetTestDataPath("cities/generated_points.parquet"), - GetTestDataPath("countries/generated_points.parquet")}; - - for (int i = 0; i < polys.size(); i++) { - auto poly_path = TestUtils::GetTestDataPath(polys[i]); - auto point_path = TestUtils::GetCanonicalPath(points[i]); - TestJoiner(point_path, poly_path, Predicate::kIntersects, 10); - } -} - -TEST(JoinerTest, PolygonPolygonContains) { - SpatialJoiner::SpatialJoinerConfig config; - std::string ptx_root = TestUtils::GetTestShaderPath(); - config.ptx_root = ptx_root.c_str(); - SpatialJoiner spatial_joiner; - - nanoarrow::UniqueArrayStream poly1_stream, poly2_stream; - - auto poly1_path = TestUtils::GetTestDataPath("arrowipc/test_polygons1.arrows"); - auto poly2_path = TestUtils::GetTestDataPath("arrowipc/test_polygons2.arrows"); - - ArrayStreamFromIpc(poly1_path, "geometry", poly1_stream.get()); - ArrayStreamFromIpc(poly2_path, 
"geometry", poly2_stream.get()); - - nanoarrow::UniqueSchema build_schema, stream_schema; - nanoarrow::UniqueArray build_array, stream_array; - ArrowError error; - ArrowErrorSet(&error, ""); - int n_row_groups = 100; - int array_index_offset = 0; - std::vector build_indices, stream_indices; - geoarrow::geos::ArrayReader reader; - - class GEOSCppHandle { - public: - GEOSContextHandle_t handle; - - GEOSCppHandle() { handle = GEOS_init_r(); } - - ~GEOSCppHandle() { GEOS_finish_r(handle); } - }; - GEOSCppHandle handle; - - reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB); - - geoarrow::geos::GeometryVector geom_polygons1(handle.handle); - geoarrow::geos::GeometryVector geom_polygons2(handle.handle); - struct Payload { - GEOSContextHandle_t handle; - const GEOSGeometry* geom; - int64_t build_index_offset; - int64_t stream_index_offset; - std::vector build_indices; - std::vector stream_indices; - }; - - int64_t build_count = 0; - spatial_joiner.Init(&config); - for (int i = 0; i < n_row_groups; i++) { - ASSERT_EQ(ArrowArrayStreamGetNext(poly1_stream.get(), build_array.get(), &error), - NANOARROW_OK); - ASSERT_EQ(ArrowArrayStreamGetSchema(poly1_stream.get(), build_schema.get(), &error), - NANOARROW_OK); - - ASSERT_EQ(ArrowArrayStreamGetNext(poly2_stream.get(), stream_array.get(), &error), - NANOARROW_OK); - ASSERT_EQ(ArrowArrayStreamGetSchema(poly2_stream.get(), stream_schema.get(), &error), - NANOARROW_OK); - - spatial_joiner.Clear(); - spatial_joiner.PushBuild(nullptr, build_array.get(), 0, build_array->length); - auto context = spatial_joiner.CreateContext(); - - build_indices.clear(); - stream_indices.clear(); - spatial_joiner.FinishBuilding(); - spatial_joiner.PushStream(context.get(), nullptr, stream_array.get(), 0, - stream_array->length, Predicate::kContains, &build_indices, - &stream_indices, array_index_offset); - geom_polygons1.resize(build_array->length); - geom_polygons2.resize(stream_array->length); - - size_t n_polygons1 = 0, n_polygons2 = 
0; - ASSERT_EQ(reader.Read(build_array.get(), 0, build_array->length, - geom_polygons1.mutable_data(), &n_polygons1), - GEOARROW_GEOS_OK); - ASSERT_EQ(reader.Read(stream_array.get(), 0, stream_array->length, - geom_polygons2.mutable_data(), &n_polygons2), - GEOARROW_GEOS_OK); - - auto* tree = GEOSSTRtree_create_r(handle.handle, 10); - - for (size_t j = 0; j < n_polygons1; j++) { - auto* geom_polygon = geom_polygons1.borrow(j); - auto* box = GEOSEnvelope_r(handle.handle, geom_polygon); - GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom_polygon, (void*)j); - GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom_polygon); - GEOSGeom_destroy_r(handle.handle, box); - } - ASSERT_EQ(GEOSSTRtree_build_r(handle.handle, tree), 1); - - Payload payload; - payload.handle = handle.handle; - - payload.build_index_offset = build_count; - payload.stream_index_offset = array_index_offset; - - for (size_t j = 0; j < n_polygons2; j++) { - auto* geom_poly2 = geom_polygons2.borrow(j); - GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom_poly2, (void*)j); - - payload.geom = geom_poly2; - - GEOSSTRtree_query_r( - handle.handle, tree, geom_poly2, - [](void* item, void* data) { - auto* polygon1 = (GEOSGeometry*)item; - auto* payload = (Payload*)data; - auto* polygon2 = payload->geom; - - if (GEOSContains_r(payload->handle, polygon1, polygon2) == 1) { - auto polygon1_id = - (size_t)GEOSGeom_getUserData_r(payload->handle, polygon1); - auto polygon2_id = - (size_t)GEOSGeom_getUserData_r(payload->handle, polygon2); - payload->build_indices.push_back(payload->build_index_offset + polygon1_id); - payload->stream_indices.push_back(payload->stream_index_offset + - polygon2_id); - } - }, - (void*)&payload); - } - - GEOSSTRtree_destroy_r(handle.handle, tree); - - ASSERT_EQ(payload.build_indices.size(), build_indices.size()); - - build_count += build_array->length; - array_index_offset += stream_array->length; - } -} - -} // namespace gpuspatial diff --git 
a/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu index f8a762974..d364add91 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/test/loader_test.cu @@ -45,6 +45,7 @@ TYPED_TEST(WKBLoaderTest, Point) { using point_t = typename TypeParam::first_type; using index_t = typename TypeParam::second_type; nanoarrow::UniqueArrayStream stream; + nanoarrow::UniqueSchema schema; ArrayStreamFromWKT({{"POINT (0 0)"}, {"POINT (10 20)", "POINT (-5.5 -12.3)"}, {"POINT (100 -50)", "POINT (3.1415926535 2.7182818284)", @@ -62,11 +63,14 @@ TYPED_TEST(WKBLoaderTest, Point) { nanoarrow::UniqueArray array; ArrowError error; ArrowErrorSet(&error, ""); - EXPECT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; if (array->length == 0) { break; } - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); } auto geometries = loader.Finish(cuda_stream); @@ -103,13 +107,17 @@ TYPED_TEST(WKBLoaderTest, MultiPoint) { while (1) { nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - EXPECT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; if (array->length == 0) { break; } - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); } auto geometries = loader.Finish(cuda_stream); @@ -145,6 
+153,7 @@ TYPED_TEST(WKBLoaderTest, PointMultiPoint) { using point_t = typename TypeParam::first_type; using index_t = typename TypeParam::second_type; nanoarrow::UniqueArrayStream stream; + nanoarrow::UniqueSchema schema; ArrayStreamFromWKT({{"POINT (1 2)", "MULTIPOINT ((3 4), (5 6))"}, {"POINT (7 8)", "MULTIPOINT ((9 10))"}, {"MULTIPOINT EMPTY", "POINT (11 12)"}}, @@ -158,11 +167,14 @@ TYPED_TEST(WKBLoaderTest, PointMultiPoint) { nanoarrow::UniqueArray array; ArrowError error; ArrowErrorSet(&error, ""); - EXPECT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; if (array->length == 0) { break; } - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); } auto geometries = loader.Finish(cuda_stream); @@ -207,6 +219,7 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderWithHoles) { GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); @@ -215,9 +228,12 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderWithHoles) { loader.Init(); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto geometries = loader.Finish(cuda_stream); auto points = TestUtils::ToVector(cuda_stream, geometries.get_points()); @@ -327,17 +343,21 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderMultipolygon) { GEOARROW_TYPE_WKB, 
stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); rmm::cuda_stream cuda_stream; - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; ParallelWkbLoader loader; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto geometries = loader.Finish(cuda_stream); const auto& offsets = geometries.get_offsets(); @@ -431,6 +451,7 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderMultipolygonLocate) { GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); @@ -438,9 +459,12 @@ TYPED_TEST(WKBLoaderTest, PolygonWKBLoaderMultipolygonLocate) { rmm::cuda_stream cuda_stream; loader.Init(); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto geometries = loader.Finish(cuda_stream); const auto& offsets = geometries.get_offsets(); @@ -498,18 +522,21 @@ TYPED_TEST(WKBLoaderTest, MixTypes) { }, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); rmm::cuda_stream cuda_stream; - - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), 
schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; ParallelWkbLoader loader; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto geometries = loader.Finish(cuda_stream); const auto& offsets = geometries.get_offsets(); @@ -598,19 +625,22 @@ TYPED_TEST(WKBLoaderTest, GeomCollection) { "MULTIPOLYGON(((30 20, 45 40, 10 40, 30 20)), ((15 5, 40 10, 10 30, 15 5), (20 15, 35 15, 35 25, 20 25, 20 15)))"}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); rmm::cuda_stream cuda_stream; - - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; ParallelWkbLoader loader; typename ParallelWkbLoader::Config config; loader.Init(config); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto geometries = loader.Finish(cuda_stream); const auto& offsets = geometries.get_offsets(); diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/main.cc b/c/sedona-libgpuspatial/libgpuspatial/test/main.cc index a8b3c21f3..f89c68fcf 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/main.cc +++ b/c/sedona-libgpuspatial/libgpuspatial/test/main.cc @@ -17,6 +17,8 @@ #include // Requires C++17 #include #include + +#include "gpuspatial_testing.hpp" #include "gtest/gtest.h" namespace TestUtils { diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/refiner_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/refiner_test.cu new file mode 100644 index 000000000..67f7846e9 
--- /dev/null +++ b/c/sedona-libgpuspatial/libgpuspatial/test/refiner_test.cu @@ -0,0 +1,739 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#include "array_stream.hpp" +#include "gpuspatial/index/rt_spatial_index.hpp" +#include "gpuspatial/loader/device_geometries.cuh" +#include "gpuspatial/refine/rt_spatial_refiner.hpp" +#include "test_common.hpp" + +#include "geoarrow_geos/geoarrow_geos.hpp" +#include "nanoarrow/nanoarrow.hpp" + +#include +#include +#include +#include // For std::iota + +#include "gpuspatial/index/rt_spatial_index.cuh" +#include "gpuspatial/refine/rt_spatial_refiner.cuh" + +namespace gpuspatial { +// Function to read a single Parquet file and extract a column. +static arrow::Status ReadParquetFromFile( + arrow::fs::FileSystem* fs, // 1. Filesystem pointer (e.g., LocalFileSystem) + const std::string& file_path, // 2. Single file path instead of a folder + int64_t batch_size, const char* column_name, + std::vector>& out_arrays) { + // 1. 
Get FileInfo for the single path + ARROW_ASSIGN_OR_RAISE(auto file_info, fs->GetFileInfo(file_path)); + + // Check if the path points to a file + if (file_info.type() != arrow::fs::FileType::File) { + return arrow::Status::Invalid("Path is not a file: ", file_path); + } + + std::cout << "--- Processing Parquet file: " << file_path << " ---" << std::endl; + + // 2. Open the input file + ARROW_ASSIGN_OR_RAISE(auto input_file, fs->OpenInputFile(file_info)); + + // 3. Open the Parquet file and create an Arrow reader + ARROW_ASSIGN_OR_RAISE(auto arrow_reader, parquet::arrow::OpenFile( + input_file, arrow::default_memory_pool())); + + // 4. Set the batch size + arrow_reader->set_batch_size(batch_size); + + // 5. Get the RecordBatchReader + auto rb_reader = arrow_reader->GetRecordBatchReader().ValueOrDie(); + // 6. Read all record batches and extract the column + while (true) { + std::shared_ptr batch; + + // Read the next batch + ARROW_THROW_NOT_OK(rb_reader->ReadNext(&batch)); + + // Check for end of stream + if (!batch) { + break; + } + + // Extract the specified column and add to the output vector + std::shared_ptr column_array = batch->GetColumnByName(column_name); + if (!column_array) { + return arrow::Status::Invalid("Column not found: ", column_name); + } + out_arrays.push_back(column_array); + } + + std::cout << "Finished reading. Total arrays extracted: " << out_arrays.size() + << std::endl; + return arrow::Status::OK(); +} + +// Helper to concatenate C-style ArrowArrays +arrow::Result> ConcatCArrays( + const std::vector& c_arrays, ArrowSchema* c_schema) { + // 1. Import the schema ONCE into a C++ DataType object. + // This effectively "consumes" c_schema. + ARROW_ASSIGN_OR_RAISE(auto type, arrow::ImportType(c_schema)); + + arrow::ArrayVector arrays_to_concat; + arrays_to_concat.reserve(c_arrays.size()); + + // 2. Loop through arrays using the C++ type object. + for (ArrowArray* c_arr : c_arrays) { + // Use the ImportArray overload that takes std::shared_ptr. 
+ // This validates c_arr against 'type' without consuming 'type'. + ARROW_ASSIGN_OR_RAISE(auto arr, arrow::ImportArray(c_arr, type)); + arrays_to_concat.push_back(arr); + } + + return arrow::Concatenate(arrays_to_concat); +} + +using GeosBinaryPredicateFn = char (*)(GEOSContextHandle_t, const GEOSGeometry*, + const GEOSGeometry*); + +static GeosBinaryPredicateFn GetGeosPredicateFn(Predicate predicate) { + switch (predicate) { + case Predicate::kContains: + return &GEOSContains_r; + case Predicate::kIntersects: + return &GEOSIntersects_r; + case Predicate::kWithin: + return &GEOSWithin_r; + case Predicate::kEquals: + return &GEOSEquals_r; + case Predicate::kTouches: + return &GEOSTouches_r; + default: + throw std::out_of_range("Unsupported GEOS predicate enumeration value."); + } +} + +std::vector> ReadParquet(const std::string& path, + int batch_size = 100) { + using namespace TestUtils; + + auto fs = std::make_shared(); + + std::vector> build_arrays; + ARROW_THROW_NOT_OK( + ReadParquetFromFile(fs.get(), path, batch_size, "geometry", build_arrays)); + return build_arrays; +} + +void ReadArrowIPC(const std::string& path, std::vector& arrays, + std::vector& schemas, + uint32_t limit = std::numeric_limits::max()) { + nanoarrow::UniqueArrayStream stream; + ArrowError error; + + // Assuming this helper exists in your context or you implement it via Arrow C++ + // (It populates the C-stream from the file) + ArrayStreamFromIpc(path, "geometry", stream.get()); + uint32_t count = 0; + while (true) { + // 1. Create fresh objects for this iteration + nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; + + // 2. Get the next batch + // Note: This function expects 'array' to be empty/released. + int code = ArrowArrayStreamGetNext(stream.get(), array.get(), &error); + if (code != NANOARROW_OK) { + // Handle error (log or throw) + break; + } + + // 3. CHECK END OF STREAM + // If release is NULL, the stream is finished. 
+ if (array->release == nullptr) { + break; + } + + // 4. Get the schema for this specific batch + // ArrowArrayStreamGetSchema creates a deep copy of the schema into 'schema'. + code = ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error); + if (code != NANOARROW_OK) { + // Handle error + break; + } + + // 5. Move ownership to the output vectors + arrays.push_back(std::move(array)); + schemas.push_back(std::move(schema)); + count += array->length; + if (count >= limit) break; + } +} + +void TestJoiner(ArrowSchema* build_schema, std::vector& build_arrays, + ArrowSchema* probe_schema, std::vector& probe_arrays, + Predicate predicate) { + using namespace TestUtils; + using coord_t = double; + using fpoint_t = Point; + using box_t = Box; + + auto rt_engine = std::make_shared(); + + { + std::string ptx_root = GetTestShaderPath(); + auto config = get_default_rt_config(ptx_root); + rt_engine->Init(config); + } + + RTSpatialIndexConfig idx_config; + idx_config.rt_engine = rt_engine; + auto rt_index = CreateRTSpatialIndex(idx_config); + RTSpatialRefinerConfig refiner_config; + refiner_config.rt_engine = rt_engine; + auto rt_refiner = CreateRTSpatialRefiner(refiner_config); + + geoarrow::geos::ArrayReader reader; + + class GEOSCppHandle { + public: + GEOSContextHandle_t handle; + + GEOSCppHandle() { handle = GEOS_init_r(); } + + ~GEOSCppHandle() { GEOS_finish_r(handle); } + }; + GEOSCppHandle handle; + + reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB); + + geoarrow::geos::GeometryVector geom_build(handle.handle); + size_t total_build_length = 0; + + for (auto& array : build_arrays) { + total_build_length += array->length; + } + + // Using GEOS for reference + geom_build.resize(total_build_length); + size_t tail_build = 0; + auto* tree = GEOSSTRtree_create_r(handle.handle, 10); + for (auto& array : build_arrays) { + // geos for reference + size_t n_build; + + ASSERT_EQ(reader.Read((ArrowArray*)array, 0, array->length, + geom_build.mutable_data() + 
tail_build, &n_build), + GEOARROW_GEOS_OK); + ASSERT_EQ(array->length, n_build); + std::vector rects; + + for (size_t offset = tail_build; offset < tail_build + n_build; offset++) { + auto* geom = geom_build.borrow(offset); + auto* box = GEOSEnvelope_r(handle.handle, geom); + + double xmin, ymin, xmax, ymax; + if (GEOSGeom_getExtent_r(handle.handle, box, &xmin, &ymin, &xmax, &ymax) == 0) { + printf("Error getting extent\n"); + xmin = 0; + ymin = 0; + xmax = -1; + ymax = -1; + } + + box_t bbox(fpoint_t((float)xmin, (float)ymin), fpoint_t((float)xmax, (float)ymax)); + + rects.push_back(bbox); + + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); + GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom); + GEOSGeom_destroy_r(handle.handle, box); + } + rt_index->PushBuild(rects.data(), rects.size()); + tail_build += n_build; + } + rt_index->FinishBuilding(); + ASSERT_EQ(GEOSSTRtree_build_r(handle.handle, tree), 1); + + auto build_array_ptr = ConcatCArrays(build_arrays, build_schema).ValueOrDie(); + + nanoarrow::UniqueArray uniq_build_array; + nanoarrow::UniqueSchema uniq_build_schema; + ARROW_THROW_NOT_OK(arrow::ExportArray(*build_array_ptr, uniq_build_array.get(), + uniq_build_schema.get())); + // Start stream processing + + for (auto& array : probe_arrays) { + geoarrow::geos::GeometryVector geom_stream(handle.handle); + size_t n_stream; + geom_stream.resize(array->length); + + ASSERT_EQ(reader.Read(array, 0, array->length, geom_stream.mutable_data(), &n_stream), + GEOARROW_GEOS_OK); + + std::vector queries; + + for (size_t i = 0; i < array->length; i++) { + auto* geom = geom_stream.borrow(i); + double xmin, ymin, xmax, ymax; + int result = GEOSGeom_getExtent_r(handle.handle, geom, &xmin, &ymin, &xmax, &ymax); + ASSERT_EQ(result, 1); + box_t bbox(fpoint_t((float)xmin, (float)ymin), fpoint_t((float)xmax, (float)ymax)); + queries.push_back(bbox); + } + + std::vector build_indices, stream_indices; + + rt_index->Probe(queries.data(), 
queries.size(), &build_indices, &stream_indices); + auto old_size = build_indices.size(); + + auto new_size = rt_refiner->Refine( + uniq_build_schema.get(), uniq_build_array.get(), probe_schema, array, predicate, + build_indices.data(), stream_indices.data(), build_indices.size()); + + build_indices.resize(new_size); + stream_indices.resize(new_size); + + struct Payload { + GEOSContextHandle_t handle; + const GEOSGeometry* geom; + std::vector build_indices; + std::vector stream_indices; + Predicate predicate; + }; + + Payload payload; + payload.predicate = predicate; + payload.handle = handle.handle; + + for (size_t offset = 0; offset < n_stream; offset++) { + auto* geom = geom_stream.borrow(offset); + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); + payload.geom = geom; + + GEOSSTRtree_query_r( + handle.handle, tree, geom, + [](void* item, void* data) { + auto* geom_build = (GEOSGeometry*)item; + auto* payload = (Payload*)data; + auto* geom_stream = payload->geom; + + if (GetGeosPredicateFn(payload->predicate)(payload->handle, geom_build, + geom_stream) == 1) { + auto build_id = (size_t)GEOSGeom_getUserData_r(payload->handle, geom_build); + auto stream_id = + (size_t)GEOSGeom_getUserData_r(payload->handle, geom_stream); + payload->build_indices.push_back(build_id); + payload->stream_indices.push_back(stream_id); + } + }, + (void*)&payload); + } + + ASSERT_EQ(payload.build_indices.size(), build_indices.size()); + ASSERT_EQ(payload.stream_indices.size(), stream_indices.size()); + sort_vectors_by_index(payload.build_indices, payload.stream_indices); + sort_vectors_by_index(build_indices, stream_indices); + for (size_t j = 0; j < build_indices.size(); j++) { + ASSERT_EQ(payload.build_indices[j], build_indices[j]); + ASSERT_EQ(payload.stream_indices[j], stream_indices[j]); + } + } + GEOSSTRtree_destroy_r(handle.handle, tree); +} + +void TestJoinerLoaded(ArrowSchema* build_schema, std::vector& build_arrays, + ArrowSchema* probe_schema, 
std::vector& probe_arrays, + Predicate predicate, bool pipelined = false) { + using namespace TestUtils; + using coord_t = double; + using fpoint_t = Point; + using box_t = Box; + + auto rt_engine = std::make_shared(); + + { + std::string ptx_root = TestUtils::GetTestShaderPath(); + auto config = get_default_rt_config(ptx_root); + rt_engine->Init(config); + } + + RTSpatialIndexConfig idx_config; + idx_config.rt_engine = rt_engine; + auto rt_index = CreateRTSpatialIndex(idx_config); + + RTSpatialRefinerConfig refiner_config; + refiner_config.rt_engine = rt_engine; + if (pipelined) { + refiner_config.pipeline_batches = 10; + } + auto rt_refiner = CreateRTSpatialRefiner(refiner_config); + geoarrow::geos::ArrayReader reader; + + class GEOSCppHandle { + public: + GEOSContextHandle_t handle; + + GEOSCppHandle() { handle = GEOS_init_r(); } + + ~GEOSCppHandle() { GEOS_finish_r(handle); } + }; + GEOSCppHandle handle; + + reader.InitFromEncoding(handle.handle, GEOARROW_GEOS_ENCODING_WKB); + + geoarrow::geos::GeometryVector geom_build(handle.handle); + size_t total_build_length = 0; + + for (auto& array : build_arrays) { + total_build_length += array->length; + } + + // Using GEOS for reference + geom_build.resize(total_build_length); + size_t tail_build = 0; + auto* tree = GEOSSTRtree_create_r(handle.handle, 10); + for (auto& array : build_arrays) { + // geos for reference + size_t n_build; + + ASSERT_EQ(reader.Read((ArrowArray*)array, 0, array->length, + geom_build.mutable_data() + tail_build, &n_build), + GEOARROW_GEOS_OK); + ASSERT_EQ(array->length, n_build); + std::vector rects; + + for (size_t offset = tail_build; offset < tail_build + n_build; offset++) { + auto* geom = geom_build.borrow(offset); + auto* box = GEOSEnvelope_r(handle.handle, geom); + + double xmin, ymin, xmax, ymax; + if (GEOSGeom_getExtent_r(handle.handle, box, &xmin, &ymin, &xmax, &ymax) == 0) { + xmin = 0; + ymin = 0; + xmax = -1; + ymax = -1; + } + + box_t bbox(fpoint_t((float)xmin, (float)ymin), 
fpoint_t((float)xmax, (float)ymax)); + + rects.push_back(bbox); + + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); + GEOSSTRtree_insert_r(handle.handle, tree, box, (void*)geom); + GEOSGeom_destroy_r(handle.handle, box); + } + rt_index->PushBuild(rects.data(), rects.size()); + tail_build += n_build; + } + rt_index->FinishBuilding(); + ASSERT_EQ(GEOSSTRtree_build_r(handle.handle, tree), 1); + + auto build_array_ptr = ConcatCArrays(build_arrays, build_schema).ValueOrDie(); + + nanoarrow::UniqueArray uniq_build_array; + nanoarrow::UniqueSchema uniq_build_schema; + ARROW_THROW_NOT_OK(arrow::ExportArray(*build_array_ptr, uniq_build_array.get(), + uniq_build_schema.get())); + // Start stream processing + + rt_refiner->PushBuild(uniq_build_schema.get(), uniq_build_array.get()); + rt_refiner->FinishBuilding(); + + for (auto& array : probe_arrays) { + geoarrow::geos::GeometryVector geom_stream(handle.handle); + size_t n_stream; + geom_stream.resize(array->length); + + ASSERT_EQ(reader.Read(array, 0, array->length, geom_stream.mutable_data(), &n_stream), + GEOARROW_GEOS_OK); + + std::vector queries; + + for (size_t i = 0; i < array->length; i++) { + auto* geom = geom_stream.borrow(i); + double xmin, ymin, xmax, ymax; + int result = GEOSGeom_getExtent_r(handle.handle, geom, &xmin, &ymin, &xmax, &ymax); + ASSERT_EQ(result, 1); + box_t bbox(fpoint_t((float)xmin, (float)ymin), fpoint_t((float)xmax, (float)ymax)); + queries.push_back(bbox); + } + + std::vector build_indices, stream_indices; + + rt_index->Probe(queries.data(), queries.size(), &build_indices, &stream_indices); + auto old_size = build_indices.size(); + + auto new_size = + rt_refiner->Refine(probe_schema, array, predicate, build_indices.data(), + stream_indices.data(), build_indices.size()); + + printf("Old size %u, new size %u\n", (unsigned)old_size, (unsigned)new_size); + build_indices.resize(new_size); + stream_indices.resize(new_size); + + struct Payload { + GEOSContextHandle_t handle; 
+ const GEOSGeometry* geom; + std::vector build_indices; + std::vector stream_indices; + Predicate predicate; + }; + + Payload payload; + payload.predicate = predicate; + payload.handle = handle.handle; + + for (size_t offset = 0; offset < n_stream; offset++) { + auto* geom = geom_stream.borrow(offset); + GEOSGeom_setUserData_r(handle.handle, (GEOSGeometry*)geom, (void*)offset); + payload.geom = geom; + + GEOSSTRtree_query_r( + handle.handle, tree, geom, + [](void* item, void* data) { + auto* geom_build = (GEOSGeometry*)item; + auto* payload = (Payload*)data; + auto* geom_stream = payload->geom; + + if (GetGeosPredicateFn(payload->predicate)(payload->handle, geom_build, + geom_stream) == 1) { + auto build_id = (size_t)GEOSGeom_getUserData_r(payload->handle, geom_build); + auto stream_id = + (size_t)GEOSGeom_getUserData_r(payload->handle, geom_stream); + payload->build_indices.push_back(build_id); + payload->stream_indices.push_back(stream_id); + } + }, + (void*)&payload); + } + + ASSERT_EQ(payload.build_indices.size(), build_indices.size()); + ASSERT_EQ(payload.stream_indices.size(), stream_indices.size()); + sort_vectors_by_index(payload.build_indices, payload.stream_indices); + sort_vectors_by_index(build_indices, stream_indices); + for (size_t j = 0; j < build_indices.size(); j++) { + ASSERT_EQ(payload.build_indices[j], build_indices[j]); + ASSERT_EQ(payload.stream_indices[j], stream_indices[j]); + } + } + GEOSSTRtree_destroy_r(handle.handle, tree); +} + +TEST(JoinerTest, PIPContainsParquet) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector polys{ + GetTestDataPath("cities/natural-earth_cities_geo.parquet"), + GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; + std::vector points{GetTestDataPath("cities/generated_points.parquet"), + GetTestDataPath("countries/generated_points.parquet")}; + + for (int i = 0; i < polys.size(); i++) { + auto poly_path = TestUtils::GetTestDataPath(polys[i]); + auto point_path = 
TestUtils::GetCanonicalPath(points[i]); + auto poly_arrays = ReadParquet(poly_path, 1000); + auto point_arrays = ReadParquet(point_path, 1000); + std::vector poly_uniq_arrays, point_uniq_arrays; + std::vector poly_uniq_schema, point_uniq_schema; + + for (auto& arr : poly_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly_uniq_arrays.emplace_back().get(), + poly_uniq_schema.emplace_back().get())); + } + for (auto& arr : point_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, point_uniq_arrays.emplace_back().get(), + point_uniq_schema.emplace_back().get())); + } + + std::vector poly_c_arrays, point_c_arrays; + for (auto& arr : poly_uniq_arrays) { + poly_c_arrays.push_back(arr.get()); + } + for (auto& arr : point_uniq_arrays) { + point_c_arrays.push_back(arr.get()); + } + TestJoinerLoaded(poly_uniq_schema[0].get(), poly_c_arrays, point_uniq_schema[0].get(), + point_c_arrays, Predicate::kContains); + } +} + +TEST(JoinerTest, PIPContainsParquetLoaded) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector polys{ + GetTestDataPath("cities/natural-earth_cities_geo.parquet"), + GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; + std::vector points{GetTestDataPath("cities/generated_points.parquet"), + GetTestDataPath("countries/generated_points.parquet")}; + + for (int i = 0; i < polys.size(); i++) { + auto poly_path = TestUtils::GetTestDataPath(polys[i]); + auto point_path = TestUtils::GetCanonicalPath(points[i]); + auto poly_arrays = ReadParquet(poly_path, 1000); + auto point_arrays = ReadParquet(point_path, 1000); + std::vector poly_uniq_arrays, point_uniq_arrays; + std::vector poly_uniq_schema, point_uniq_schema; + + for (auto& arr : poly_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly_uniq_arrays.emplace_back().get(), + poly_uniq_schema.emplace_back().get())); + } + for (auto& arr : point_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, point_uniq_arrays.emplace_back().get(), + 
point_uniq_schema.emplace_back().get())); + } + + std::vector poly_c_arrays, point_c_arrays; + for (auto& arr : poly_uniq_arrays) { + poly_c_arrays.push_back(arr.get()); + } + for (auto& arr : point_uniq_arrays) { + point_c_arrays.push_back(arr.get()); + } + TestJoinerLoaded(poly_uniq_schema[0].get(), poly_c_arrays, point_uniq_schema[0].get(), + point_c_arrays, Predicate::kContains); + } +} + +TEST(JoinerTest, PIPContainsParquetPipelined) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector polys{ + GetTestDataPath("cities/natural-earth_cities_geo.parquet"), + GetTestDataPath("countries/natural-earth_countries_geo.parquet")}; + std::vector points{GetTestDataPath("cities/generated_points.parquet"), + GetTestDataPath("countries/generated_points.parquet")}; + + for (int i = 0; i < polys.size(); i++) { + auto poly_path = TestUtils::GetTestDataPath(polys[i]); + auto point_path = TestUtils::GetCanonicalPath(points[i]); + auto poly_arrays = ReadParquet(poly_path, 1000); + auto point_arrays = ReadParquet(point_path, 1000); + std::vector poly_uniq_arrays, point_uniq_arrays; + std::vector poly_uniq_schema, point_uniq_schema; + + for (auto& arr : poly_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, poly_uniq_arrays.emplace_back().get(), + poly_uniq_schema.emplace_back().get())); + } + for (auto& arr : point_arrays) { + ARROW_THROW_NOT_OK(arrow::ExportArray(*arr, point_uniq_arrays.emplace_back().get(), + point_uniq_schema.emplace_back().get())); + } + + std::vector poly_c_arrays, point_c_arrays; + for (auto& arr : poly_uniq_arrays) { + poly_c_arrays.push_back(arr.get()); + } + for (auto& arr : point_uniq_arrays) { + point_c_arrays.push_back(arr.get()); + } + TestJoinerLoaded(poly_uniq_schema[0].get(), poly_c_arrays, point_uniq_schema[0].get(), + point_c_arrays, Predicate::kContains, true); + } +} + +TEST(JoinerTest, PIPContainsArrowIPC) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector 
polys{GetTestDataPath("arrowipc/test_polygons.arrows")}; + std::vector points{GetTestDataPath("arrowipc/test_points.arrows")}; + + for (int i = 0; i < polys.size(); i++) { + auto poly_path = TestUtils::GetTestDataPath(polys[i]); + auto point_path = TestUtils::GetCanonicalPath(points[i]); + std::vector poly_uniq_arrays, point_uniq_arrays; + std::vector poly_uniq_schema, point_uniq_schema; + + ReadArrowIPC(poly_path, poly_uniq_arrays, poly_uniq_schema); + ReadArrowIPC(point_path, point_uniq_arrays, point_uniq_schema); + + std::vector poly_c_arrays, point_c_arrays; + for (auto& arr : poly_uniq_arrays) { + poly_c_arrays.push_back(arr.get()); + } + for (auto& arr : point_uniq_arrays) { + point_c_arrays.push_back(arr.get()); + } + + TestJoiner(poly_uniq_schema[0].get(), poly_c_arrays, point_uniq_schema[0].get(), + point_c_arrays, Predicate::kContains); + } +} + +TEST(JoinerTest, PIPWithinArrowIPC) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector polys{GetTestDataPath("arrowipc/test_polygons.arrows")}; + std::vector points{GetTestDataPath("arrowipc/test_points.arrows")}; + + for (int i = 0; i < polys.size(); i++) { + auto poly_path = TestUtils::GetTestDataPath(polys[i]); + auto point_path = TestUtils::GetCanonicalPath(points[i]); + std::vector poly_uniq_arrays, point_uniq_arrays; + std::vector poly_uniq_schema, point_uniq_schema; + + ReadArrowIPC(poly_path, poly_uniq_arrays, poly_uniq_schema); + ReadArrowIPC(point_path, point_uniq_arrays, point_uniq_schema); + + std::vector poly_c_arrays, point_c_arrays; + for (auto& arr : poly_uniq_arrays) { + poly_c_arrays.push_back(arr.get()); + } + for (auto& arr : point_uniq_arrays) { + point_c_arrays.push_back(arr.get()); + } + + TestJoiner(point_uniq_schema[0].get(), point_c_arrays, poly_uniq_schema[0].get(), + poly_c_arrays, Predicate::kWithin); + } +} + +TEST(JoinerTest, PolygonPolygonContains) { + using namespace TestUtils; + auto fs = std::make_shared(); + + std::vector 
polys1{GetTestDataPath("arrowipc/test_polygons1.arrows")}; + std::vector polys2{GetTestDataPath("arrowipc/test_polygons2.arrows")}; + + for (int i = 0; i < polys1.size(); i++) { + auto poly1_path = TestUtils::GetTestDataPath(polys1[i]); + auto poly2_path = TestUtils::GetCanonicalPath(polys2[i]); + std::vector poly1_uniq_arrays, poly2_uniq_arrays; + std::vector poly1_uniq_schema, poly2_uniq_schema; + + ReadArrowIPC(poly1_path, poly1_uniq_arrays, poly1_uniq_schema, 100); + ReadArrowIPC(poly2_path, poly2_uniq_arrays, poly2_uniq_schema, 100); + + std::vector poly1_c_arrays, poly2_c_arrays; + for (auto& arr : poly1_uniq_arrays) { + poly1_c_arrays.push_back(arr.get()); + } + for (auto& arr : poly2_uniq_arrays) { + poly2_c_arrays.push_back(arr.get()); + } + + TestJoiner(poly1_uniq_schema[0].get(), poly1_c_arrays, poly2_uniq_schema[0].get(), + poly2_c_arrays, Predicate::kIntersects); + } +} +} // namespace gpuspatial diff --git a/c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu b/c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu index fabcd3f5c..6630ef071 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu +++ b/c/sedona-libgpuspatial/libgpuspatial/test/related_test.cu @@ -58,15 +58,18 @@ void ParseWKTPoint(const char* wkt, POINT_T& point) { nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, 
array->length); auto device_geometries = loader.Finish(cuda_stream); auto h_vec = TestUtils::ToVector(cuda_stream, device_geometries.get_points()); cuda_stream.synchronize(); @@ -79,15 +82,19 @@ void ParseWKTMultiPoint(Context& ctx, const char* wkt, nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = loader.Finish(cuda_stream); ctx.prefix_sum1 = TestUtils::ToVector( @@ -108,15 +115,19 @@ void ParseWKTLineString(Context& ctx, const char* wkt, nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = loader.Finish(cuda_stream); ctx.prefix_sum1 = TestUtils::ToVector( cuda_stream, 
device_geometries.get_offsets().line_string_offsets.ps_num_points); @@ -136,15 +147,19 @@ void ParseWKTMultiLineString(Context& ctx, const char* wkt, nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = loader.Finish(cuda_stream); ctx.prefix_sum1 = TestUtils::ToVector( cuda_stream, @@ -169,15 +184,19 @@ void ParseWKTPolygon(Context& ctx, const char* wkt, nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = loader.Finish(cuda_stream); ctx.prefix_sum1 = TestUtils::ToVector( cuda_stream, device_geometries.get_offsets().polygon_offsets.ps_num_rings); @@ -200,15 +219,19 @@ void ParseWKTMultiPolygon(Context& 
ctx, const char* wkt, nanoarrow::UniqueArrayStream stream; ArrayStreamFromWKT({{wkt}}, GEOARROW_TYPE_WKB, stream.get()); nanoarrow::UniqueArray array; + nanoarrow::UniqueSchema schema; ArrowError error; ArrowErrorSet(&error, ""); - ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK); + ASSERT_EQ(ArrowArrayStreamGetSchema(stream.get(), schema.get(), &error), NANOARROW_OK) + << error.message; + ASSERT_EQ(ArrowArrayStreamGetNext(stream.get(), array.get(), &error), NANOARROW_OK) + << error.message; loader_t loader; auto cuda_stream = rmm::cuda_stream_default; loader.Init(); - loader.Parse(cuda_stream, array.get(), 0, array->length); + loader.Parse(cuda_stream, schema.get(), array.get(), 0, array->length); auto device_geometries = loader.Finish(cuda_stream); ctx.prefix_sum1 = TestUtils::ToVector( cuda_stream, device_geometries.get_offsets().multi_polygon_offsets.ps_num_parts); diff --git a/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json b/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json index b162d78e2..f593623e8 100644 --- a/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json +++ b/c/sedona-libgpuspatial/libgpuspatial/vcpkg.json @@ -7,6 +7,7 @@ "dependencies": [ "gtest", "geos", + "zstd", { "name": "arrow", "features": [ diff --git a/c/sedona-libgpuspatial/src/error.rs b/c/sedona-libgpuspatial/src/error.rs index 3530e40e8..d38897019 100644 --- a/c/sedona-libgpuspatial/src/error.rs +++ b/c/sedona-libgpuspatial/src/error.rs @@ -24,7 +24,8 @@ pub enum GpuSpatialError { Init(String), PushBuild(String), FinishBuild(String), - PushStream(String), + Probe(String), + Refine(String), } impl From for GpuSpatialError { @@ -48,8 +49,11 @@ impl fmt::Display for GpuSpatialError { GpuSpatialError::FinishBuild(errmsg) => { write!(f, "Finish building failed: {}", errmsg) } - GpuSpatialError::PushStream(errmsg) => { - write!(f, "Push stream failed: {}", errmsg) + GpuSpatialError::Probe(errmsg) => { + write!(f, "Probe failed: {}", errmsg) + } + 
GpuSpatialError::Refine(errmsg) => { + write!(f, "Refine failed: {}", errmsg) } } } diff --git a/c/sedona-libgpuspatial/src/lib.rs b/c/sedona-libgpuspatial/src/lib.rs index 1bcd4ef43..27714897b 100644 --- a/c/sedona-libgpuspatial/src/lib.rs +++ b/c/sedona-libgpuspatial/src/lib.rs @@ -23,30 +23,44 @@ mod libgpuspatial; #[cfg(gpu_available)] mod libgpuspatial_glue_bindgen; -// Import Array trait for len() method (used in gpu_available code) #[cfg(gpu_available)] -use arrow_array::Array; - +use std::sync::{Arc, Mutex}; +// Import Array trait for len() method (used in gpu_available code) +use geo::Rect; // Re-exports for GPU functionality #[cfg(gpu_available)] pub use error::GpuSpatialError; #[cfg(gpu_available)] -pub use libgpuspatial::{GpuSpatialJoinerWrapper, GpuSpatialPredicateWrapper}; +pub use libgpuspatial::{ + GpuSpatialIndexFloat2DWrapper, GpuSpatialRefinerWrapper, GpuSpatialRelationPredicateWrapper, + GpuSpatialRuntimeWrapper, +}; #[cfg(gpu_available)] -pub use libgpuspatial_glue_bindgen::GpuSpatialJoinerContext; +pub use libgpuspatial_glue_bindgen::SedonaSpatialIndexContext; +#[cfg(gpu_available)] +use nvml_wrapper::Nvml; // Mark GPU types as Send for thread safety // SAFETY: The GPU library is designed to be used from multiple threads. // Each thread gets its own context, and the underlying GPU library handles thread safety. // The raw pointers inside are managed by the C++ library which ensures proper synchronization. 
#[cfg(gpu_available)] -unsafe impl Send for GpuSpatialJoinerContext {} +unsafe impl Send for SedonaSpatialIndexContext {} +#[cfg(gpu_available)] +unsafe impl Send for libgpuspatial_glue_bindgen::GpuSpatialRuntime {} +#[cfg(gpu_available)] +unsafe impl Sync for libgpuspatial_glue_bindgen::GpuSpatialRuntime {} #[cfg(gpu_available)] -unsafe impl Send for libgpuspatial_glue_bindgen::GpuSpatialJoiner {} +unsafe impl Send for libgpuspatial_glue_bindgen::SedonaFloatIndex2D {} +#[cfg(gpu_available)] +unsafe impl Send for libgpuspatial_glue_bindgen::SedonaSpatialRefiner {} + +#[cfg(gpu_available)] +unsafe impl Sync for libgpuspatial_glue_bindgen::SedonaFloatIndex2D {} #[cfg(gpu_available)] -unsafe impl Send for GpuSpatialJoinerWrapper {} +unsafe impl Sync for libgpuspatial_glue_bindgen::SedonaSpatialRefiner {} // Error type for non-GPU builds #[cfg(not(gpu_available))] @@ -58,16 +72,76 @@ pub enum GpuSpatialError { pub type Result = std::result::Result; +/// Spatial predicates for GPU operations +#[repr(u32)] +#[derive(Debug, PartialEq, Copy, Clone)] +pub enum GpuSpatialRelationPredicate { + Equals = 0, + Disjoint = 1, + Touches = 2, + Contains = 3, + Covers = 4, + Intersects = 5, + Within = 6, + CoveredBy = 7, +} + +impl std::fmt::Display for GpuSpatialRelationPredicate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + GpuSpatialRelationPredicate::Equals => write!(f, "equals"), + GpuSpatialRelationPredicate::Disjoint => write!(f, "disjoint"), + GpuSpatialRelationPredicate::Touches => write!(f, "touches"), + GpuSpatialRelationPredicate::Contains => write!(f, "contains"), + GpuSpatialRelationPredicate::Covers => write!(f, "covers"), + GpuSpatialRelationPredicate::Intersects => write!(f, "intersects"), + GpuSpatialRelationPredicate::Within => write!(f, "within"), + GpuSpatialRelationPredicate::CoveredBy => write!(f, "coveredby"), + } + } +} + +#[cfg(gpu_available)] +impl From for GpuSpatialRelationPredicateWrapper { + fn from(pred: 
GpuSpatialRelationPredicate) -> Self { + match pred { + GpuSpatialRelationPredicate::Equals => GpuSpatialRelationPredicateWrapper::Equals, + GpuSpatialRelationPredicate::Disjoint => GpuSpatialRelationPredicateWrapper::Disjoint, + GpuSpatialRelationPredicate::Touches => GpuSpatialRelationPredicateWrapper::Touches, + GpuSpatialRelationPredicate::Contains => GpuSpatialRelationPredicateWrapper::Contains, + GpuSpatialRelationPredicate::Covers => GpuSpatialRelationPredicateWrapper::Covers, + GpuSpatialRelationPredicate::Intersects => { + GpuSpatialRelationPredicateWrapper::Intersects + } + GpuSpatialRelationPredicate::Within => GpuSpatialRelationPredicateWrapper::Within, + GpuSpatialRelationPredicate::CoveredBy => GpuSpatialRelationPredicateWrapper::CoveredBy, + } + } +} + +/// Global shared GpuSpatialRuntime. Building an instance is expensive, so we share it across all GpuSpatial instances. +#[cfg(gpu_available)] +static GLOBAL_GPUSPATIAL_RUNTIME: Mutex>>> = + Mutex::new(None); /// High-level wrapper for GPU spatial operations -pub struct GpuSpatialContext { +pub struct GpuSpatial { + #[cfg(gpu_available)] + runtime: Option>>, #[cfg(gpu_available)] - joiner: Option, + index: Option, #[cfg(gpu_available)] - context: Option, - initialized: bool, + refiner: Option, +} + +pub struct GpuSpatialOptions { + pub cuda_init_memory_pool_ratio: f32, + pub concurrency: u32, + pub device_id: i32, + pub compress_bvh: bool, + pub pipeline_batches: u32, } -impl GpuSpatialContext { +impl GpuSpatial { pub fn new() -> Result { #[cfg(not(gpu_available))] { @@ -77,197 +151,477 @@ impl GpuSpatialContext { #[cfg(gpu_available)] { Ok(Self { - joiner: None, - context: None, - initialized: false, + runtime: None, + index: None, + refiner: None, }) } } - pub fn init(&mut self) -> Result<()> { + pub fn init(&mut self, options: GpuSpatialOptions) -> Result<()> { #[cfg(not(gpu_available))] { + let _ = (concurrency, device_id); Err(GpuSpatialError::GpuNotAvailable) } #[cfg(gpu_available)] { - let mut 
joiner = GpuSpatialJoinerWrapper::new(); - // Get PTX path from OUT_DIR - let out_path = std::path::PathBuf::from(env!("OUT_DIR")); - let ptx_root = out_path.join("share/gpuspatial/shaders"); - let ptx_root_str = ptx_root - .to_str() - .ok_or_else(|| GpuSpatialError::Init("Invalid PTX path".to_string()))?; - - // Initialize with concurrency of 1 for now - joiner.init(1, ptx_root_str)?; - - // Create context - let mut ctx = GpuSpatialJoinerContext { - last_error: std::ptr::null(), - private_data: std::ptr::null_mut(), - build_indices: std::ptr::null_mut(), - stream_indices: std::ptr::null_mut(), - }; - joiner.create_context(&mut ctx); + // Acquire the lock for the global shared runtime + let mut global_runtime_guard = GLOBAL_GPUSPATIAL_RUNTIME.lock().unwrap(); + + // Initialize the global runtime if it hasn't been initialized yet + if global_runtime_guard.is_none() { + // Get PTX path from OUT_DIR + let out_path = std::path::PathBuf::from(env!("OUT_DIR")); + let ptx_root = out_path.join("share/gpuspatial/shaders"); + let ptx_root_str = ptx_root + .to_str() + .ok_or_else(|| GpuSpatialError::Init("Invalid PTX path".to_string()))?; + + let runtime = GpuSpatialRuntimeWrapper::try_new( + options.device_id, + ptx_root_str, + options.cuda_init_memory_pool_ratio, + )?; + *global_runtime_guard = Some(Arc::new(Mutex::new(runtime))); + } + + // Get a clone of the Arc to the shared runtime + // safe to unwrap here because we just ensured it is Some + let runtime_ref = global_runtime_guard.as_ref().unwrap().clone(); + // Assign to self + self.runtime = Some(runtime_ref); + + let index = GpuSpatialIndexFloat2DWrapper::try_new( + self.runtime.as_ref().unwrap(), + options.concurrency, + )?; + + self.index = Some(index); + + let refiner = GpuSpatialRefinerWrapper::try_new( + self.runtime.as_ref().unwrap(), + options.concurrency, + options.compress_bvh, + options.pipeline_batches, + )?; + self.refiner = Some(refiner); - self.joiner = Some(joiner); - self.context = Some(ctx); - 
self.initialized = true; Ok(()) } } - #[cfg(gpu_available)] - pub fn get_joiner_mut(&mut self) -> Option<&mut GpuSpatialJoinerWrapper> { - self.joiner.as_mut() + pub fn is_gpu_available() -> bool { + #[cfg(not(gpu_available))] + { + false + } + #[cfg(gpu_available)] + { + let nvml = match Nvml::init() { + Ok(instance) => instance, + Err(_) => return false, + }; + + // Check if the device count is greater than zero + match nvml.device_count() { + Ok(count) => count > 0, + Err(_) => false, + } + } } - #[cfg(gpu_available)] - pub fn get_context_mut(&mut self) -> Option<&mut GpuSpatialJoinerContext> { - self.context.as_mut() + /// Clear previous build data + pub fn index_clear(&mut self) -> Result<()> { + #[cfg(not(gpu_available))] + { + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let index = self + .index + .as_mut() + .ok_or_else(|| GpuSpatialError::Init("GPU index is not available".into()))?; + + // Clear previous build data + index.clear(); + Ok(()) + } } - pub fn is_initialized(&self) -> bool { - self.initialized + pub fn index_push_build(&mut self, rects: &[Rect]) -> Result<()> { + #[cfg(not(gpu_available))] + { + let _ = rects; + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let index = self + .index + .as_mut() + .ok_or_else(|| GpuSpatialError::Init("GPU index not available".into()))?; + + unsafe { index.push_build(rects.as_ptr() as *const f32, rects.len() as u32) } + } + } + + pub fn index_finish_building(&mut self) -> Result<()> { + #[cfg(not(gpu_available))] + return Err(GpuSpatialError::GpuNotAvailable); + + #[cfg(gpu_available)] + self.index + .as_mut() + .ok_or_else(|| GpuSpatialError::Init("GPU index not available".into()))? 
+ .finish_building() } - /// Perform spatial join between two geometry arrays - pub fn spatial_join( - &mut self, - left_geom: arrow_array::ArrayRef, - right_geom: arrow_array::ArrayRef, - predicate: SpatialPredicate, - ) -> Result<(Vec, Vec)> { + pub fn probe(&self, rects: &[Rect]) -> Result<(Vec, Vec)> { #[cfg(not(gpu_available))] { - let _ = (left_geom, right_geom, predicate); + let _ = rects; Err(GpuSpatialError::GpuNotAvailable) } #[cfg(gpu_available)] { - if !self.initialized { - return Err(GpuSpatialError::Init("Context not initialized".into())); - } + let index = self + .index + .as_ref() + .ok_or_else(|| GpuSpatialError::Init("GPU index not available".into()))?; - let joiner = self - .joiner - .as_mut() - .ok_or_else(|| GpuSpatialError::Init("GPU joiner not available".into()))?; + let mut ctx = SedonaSpatialIndexContext { + private_data: std::ptr::null_mut(), + }; + index.create_context(&mut ctx); - // Clear previous build data - joiner.clear(); - - // Push build data (left side) - log::info!( - "DEBUG: Pushing {} geometries to GPU (build side)", - left_geom.len() - ); - log::info!("DEBUG: Left array data type: {:?}", left_geom.data_type()); - if let Some(binary_arr) = left_geom - .as_any() - .downcast_ref::() - { - log::info!("DEBUG: Left binary array has {} values", binary_arr.len()); - if binary_arr.len() > 0 { - let first_wkb = binary_arr.value(0); - log::info!( - "DEBUG: First left WKB length: {}, first bytes: {:?}", - first_wkb.len(), - &first_wkb[..8.min(first_wkb.len())] - ); + let result = (|| -> Result<(Vec, Vec)> { + unsafe { + // If this fails, it returns Err from the *closure*, not the function + index.probe(&mut ctx, rects.as_ptr() as *const f32, rects.len() as u32)?; } - } - joiner.push_build(&left_geom, 0, left_geom.len() as i64)?; - joiner.finish_building()?; + // Copy results + let build_indices = index.get_build_indices_buffer(&mut ctx).to_vec(); + let probe_indices = index.get_probe_indices_buffer(&mut ctx).to_vec(); - // Recreate 
context after building (required by libgpuspatial) - let mut new_context = libgpuspatial_glue_bindgen::GpuSpatialJoinerContext { - last_error: std::ptr::null(), - private_data: std::ptr::null_mut(), - build_indices: std::ptr::null_mut(), - stream_indices: std::ptr::null_mut(), - }; - joiner.create_context(&mut new_context); - self.context = Some(new_context); - let context = self.context.as_mut().unwrap(); - // Push stream data (right side) and perform join - let gpu_predicate = predicate.into(); - joiner.push_stream( - context, - &right_geom, - 0, - right_geom.len() as i64, - gpu_predicate, - 0, // array_index_offset - )?; + Ok((build_indices, probe_indices)) + })(); - // Get results - let build_indices = joiner.get_build_indices_buffer(context).to_vec(); - let stream_indices = joiner.get_stream_indices_buffer(context).to_vec(); + index.destroy_context(&mut ctx); - Ok((build_indices, stream_indices)) + result } } -} -/// Spatial predicates for GPU operations -#[repr(u32)] -#[derive(Debug, PartialEq, Copy, Clone)] -pub enum SpatialPredicate { - Equals = 0, - Disjoint = 1, - Touches = 2, - Contains = 3, - Covers = 4, - Intersects = 5, - Within = 6, - CoveredBy = 7, -} + pub fn refiner_clear(&mut self) -> Result<()> { + #[cfg(not(gpu_available))] + { + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let refiner = self + .refiner + .as_mut() + .ok_or_else(|| GpuSpatialError::Init("GPU refiner is not available".into()))?; -#[cfg(gpu_available)] -impl From for GpuSpatialPredicateWrapper { - fn from(pred: SpatialPredicate) -> Self { - match pred { - SpatialPredicate::Equals => GpuSpatialPredicateWrapper::Equals, - SpatialPredicate::Disjoint => GpuSpatialPredicateWrapper::Disjoint, - SpatialPredicate::Touches => GpuSpatialPredicateWrapper::Touches, - SpatialPredicate::Contains => GpuSpatialPredicateWrapper::Contains, - SpatialPredicate::Covers => GpuSpatialPredicateWrapper::Covers, - SpatialPredicate::Intersects => 
GpuSpatialPredicateWrapper::Intersects, - SpatialPredicate::Within => GpuSpatialPredicateWrapper::Within, - SpatialPredicate::CoveredBy => GpuSpatialPredicateWrapper::CoveredBy, + // Clear previous build data + refiner.clear(); + Ok(()) } } -} -// Cleanup implementation -impl Drop for GpuSpatialContext { - fn drop(&mut self) { + pub fn refiner_push_build(&mut self, array: &arrow_array::ArrayRef) -> Result<()> { + #[cfg(not(gpu_available))] + { + let _ = array; + Err(GpuSpatialError::GpuNotAvailable) + } #[cfg(gpu_available)] { - if let (Some(mut joiner), Some(mut ctx)) = (self.joiner.take(), self.context.take()) { - joiner.destroy_context(&mut ctx); - joiner.release(); - } + let refiner = self + .refiner + .as_ref() + .ok_or_else(|| GpuSpatialError::Init("GPU refiner not available".into()))?; + + refiner.push_build(array) + } + } + + pub fn refiner_finish_building(&mut self) -> Result<()> { + #[cfg(not(gpu_available))] + { + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let refiner = self + .refiner + .as_mut() + .ok_or_else(|| GpuSpatialError::Init("GPU refiner not available".into()))?; + + refiner.finish_building() + } + } + + pub fn refine_loaded( + &self, + probe_array: &arrow_array::ArrayRef, + predicate: GpuSpatialRelationPredicate, + build_indices: &mut Vec, + probe_indices: &mut Vec, + ) -> Result<()> { + #[cfg(not(gpu_available))] + { + let _ = (probe_array, predicate, build_indices, probe_indices); + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let refiner = self + .refiner + .as_ref() + .ok_or_else(|| GpuSpatialError::Init("GPU refiner not available".into()))?; + + refiner.refine_loaded( + probe_array, + GpuSpatialRelationPredicateWrapper::from(predicate), + build_indices, + probe_indices, + ) + } + } + + pub fn refine( + &self, + array1: &arrow_array::ArrayRef, + array2: &arrow_array::ArrayRef, + predicate: GpuSpatialRelationPredicate, + indices1: &mut Vec, + indices2: &mut Vec, + ) -> Result<()> { + 
#[cfg(not(gpu_available))] + { + let _ = (array1, array2, predicate, indices1, indices2); + Err(GpuSpatialError::GpuNotAvailable) + } + #[cfg(gpu_available)] + { + let refiner = self + .refiner + .as_ref() + .ok_or_else(|| GpuSpatialError::Init("GPU refiner not available".into()))?; + + refiner.refine( + array1, + array2, + GpuSpatialRelationPredicateWrapper::from(predicate), + indices1, + indices2, + ) } } } +#[cfg(gpu_available)] #[cfg(test)] mod tests { use super::*; + use geo::{BoundingRect, Intersects, Point, Polygon}; + use sedona_expr::scalar_udf::SedonaScalarUDF; + use sedona_geos::register::scalar_kernels; + use sedona_schema::crs::lnglat; + use sedona_schema::datatypes::{Edges, SedonaType, WKB_GEOMETRY}; + use sedona_testing::create::create_array_storage; + use sedona_testing::testers::ScalarUdfTester; + use wkt::TryFromWkt; + + pub fn find_intersection_pairs( + vec_a: &[Rect], + vec_b: &[Rect], + ) -> (Vec, Vec) { + let mut ids_a = Vec::new(); + let mut ids_b = Vec::new(); + + // Iterate through A with index 'i' + for (i, rect_a) in vec_a.iter().enumerate() { + // Only proceed if 'a' exists + // Iterate through B with index 'j' + for (j, rect_b) in vec_b.iter().enumerate() { + // Check if 'b' exists and intersects 'a' + if rect_a.intersects(rect_b) { + ids_a.push(i as u32); + ids_b.push(j as u32); + } + } + } + (ids_a, ids_b) + } #[test] - fn test_context_creation() { - let ctx = GpuSpatialContext::new(); - #[cfg(gpu_available)] - assert!(ctx.is_ok()); - #[cfg(not(gpu_available))] - assert!(ctx.is_err()); + fn test_spatial_index() { + let mut gs = GpuSpatial::new().unwrap(); + let options = GpuSpatialOptions { + concurrency: 1, + device_id: 0, + compress_bvh: false, + pipeline_batches: 1, + cuda_init_memory_pool_ratio: 0.1, + }; + gs.init(options).expect("Failed to initialize GpuSpatial"); + + let polygon_values = &[ + Some("POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"), + Some("POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 
30))"), + Some("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (2 2, 3 2, 3 3, 2 3, 2 2), (6 6, 8 6, 8 8, 6 8, 6 6))"), + Some("POLYGON ((30 0, 60 20, 50 50, 10 50, 0 20, 30 0), (20 30, 25 40, 15 40, 20 30), (30 30, 35 40, 25 40, 30 30), (40 30, 45 40, 35 40, 40 30))"), + Some("POLYGON ((40 0, 50 30, 80 20, 90 70, 60 90, 30 80, 20 40, 40 0), (50 20, 65 30, 60 50, 45 40, 50 20), (30 60, 50 70, 45 80, 30 60))"), + ]; + let rects: Vec> = polygon_values + .iter() + .filter_map(|opt_wkt| { + let wkt_str = opt_wkt.as_ref()?; + let polygon: Polygon = Polygon::try_from_wkt_str(wkt_str).ok()?; + + polygon.bounding_rect() + }) + .collect(); + gs.index_push_build(&rects) + .expect("Failed to push build data"); + gs.index_finish_building() + .expect("Failed to finish building"); + let point_values = &[ + Some("POINT (30 20)"), + Some("POINT (20 20)"), + Some("POINT (1 1)"), + Some("POINT (70 70)"), + Some("POINT (55 35)"), + ]; + let points: Vec> = point_values + .iter() + .map(|opt_wkt| -> Rect { + let wkt_str = opt_wkt.unwrap(); + let point: Point = Point::try_from_wkt_str(wkt_str).ok().unwrap(); + point.bounding_rect() + }) + .collect(); + let (mut build_indices, mut probe_indices) = gs.probe(&points).unwrap(); + build_indices.sort(); + probe_indices.sort(); + + let (mut ans_build_indices, mut ans_probe_indices) = + find_intersection_pairs(&rects, &points); + + ans_build_indices.sort(); + ans_probe_indices.sort(); + + assert_eq!(build_indices, ans_build_indices); + assert_eq!(probe_indices, ans_probe_indices); + } + + #[test] + fn test_spatial_refiner() { + let mut gs = GpuSpatial::new().unwrap(); + let options = GpuSpatialOptions { + concurrency: 1, + device_id: 0, + compress_bvh: false, + pipeline_batches: 1, + cuda_init_memory_pool_ratio: 0.1, + }; + gs.init(options).expect("Failed to initialize GpuSpatial"); + + let polygon_values = &[ + Some("POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"), + Some("POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))"), 
+ Some("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (2 2, 3 2, 3 3, 2 3, 2 2), (6 6, 8 6, 8 8, 6 8, 6 6))"), + Some("POLYGON ((30 0, 60 20, 50 50, 10 50, 0 20, 30 0), (20 30, 25 40, 15 40, 20 30), (30 30, 35 40, 25 40, 30 30), (40 30, 45 40, 35 40, 40 30))"), + Some("POLYGON ((40 0, 50 30, 80 20, 90 70, 60 90, 30 80, 20 40, 40 0), (50 20, 65 30, 60 50, 45 40, 50 20), (30 60, 50 70, 45 80, 30 60))"), + ]; + let polygons = create_array_storage(polygon_values, &WKB_GEOMETRY); + + let rects: Vec> = polygon_values + .iter() + .map(|opt_wkt| -> Rect { + let wkt_str = opt_wkt.unwrap(); + let polygon: Polygon = Polygon::try_from_wkt_str(wkt_str).ok().unwrap(); + polygon.bounding_rect().unwrap() + }) + .collect(); + gs.index_push_build(&rects) + .expect("Failed to push build data"); + gs.index_finish_building() + .expect("Failed to finish building"); + let point_values = &[ + Some("POINT (30 20)"), + Some("POINT (20 20)"), + Some("POINT (1 1)"), + Some("POINT (70 70)"), + Some("POINT (55 35)"), + ]; + let points = create_array_storage(point_values, &WKB_GEOMETRY); + let point_rects: Vec> = point_values + .iter() + .map(|wkt| -> Rect { + let wkt_str = wkt.unwrap(); + + let point: Point = Point::try_from_wkt_str(wkt_str).unwrap(); + + point.bounding_rect() + }) + .collect(); + let (mut build_indices, mut probe_indices) = gs.probe(&point_rects).unwrap(); + + gs.refine( + &polygons, + &points, + GpuSpatialRelationPredicate::Intersects, + &mut build_indices, + &mut probe_indices, + ) + .expect("Failed to refine results"); + + build_indices.sort(); + probe_indices.sort(); + + let kernels = scalar_kernels(); + + // Iterate through the vector and find the one named "st_intersects" + let st_intersects = kernels + .into_iter() + .find(|(name, _)| *name == "st_intersects") + .map(|(_, kernel_ref)| kernel_ref) + .unwrap(); + + let sedona_type = SedonaType::Wkb(Edges::Planar, lnglat()); + let udf = SedonaScalarUDF::from_impl("st_intersects", st_intersects); + let tester = + 
ScalarUdfTester::new(udf.into(), vec![sedona_type.clone(), sedona_type.clone()]); + + let mut ans_build_indices: Vec = Vec::new(); + let mut ans_probe_indices: Vec = Vec::new(); + + for (poly_index, poly) in polygon_values.iter().enumerate() { + for (point_index, point) in point_values.iter().enumerate() { + let result = tester + .invoke_scalar_scalar(poly.unwrap(), point.unwrap()) + .unwrap(); + if result == true.into() { + ans_build_indices.push(poly_index as u32); + ans_probe_indices.push(point_index as u32); + } + } + } + + ans_build_indices.sort(); + ans_probe_indices.sort(); + + assert_eq!(build_indices, ans_build_indices); + assert_eq!(probe_indices, ans_probe_indices); } } diff --git a/c/sedona-libgpuspatial/src/libgpuspatial.rs b/c/sedona-libgpuspatial/src/libgpuspatial.rs index 414b92e09..3c7ecf32e 100644 --- a/c/sedona-libgpuspatial/src/libgpuspatial.rs +++ b/c/sedona-libgpuspatial/src/libgpuspatial.rs @@ -17,106 +17,141 @@ use crate::error::GpuSpatialError; use crate::libgpuspatial_glue_bindgen::*; -use arrow_array::{ffi::FFI_ArrowArray, ArrayRef}; +use arrow_array::{ffi::FFI_ArrowArray, Array, ArrayRef}; +use arrow_schema::ffi::FFI_ArrowSchema; use std::convert::TryFrom; use std::ffi::CString; use std::mem::transmute; -use std::os::raw::{c_uint, c_void}; +use std::os::raw::c_uint; +use std::sync::{Arc, Mutex}; -pub struct GpuSpatialJoinerWrapper { - joiner: GpuSpatialJoiner, +pub struct GpuSpatialRuntimeWrapper { + runtime: GpuSpatialRuntime, } -#[repr(u32)] -#[derive(Debug, PartialEq, Copy, Clone)] -pub enum GpuSpatialPredicateWrapper { - Equals = 0, - Disjoint = 1, - Touches = 2, - Contains = 3, - Covers = 4, - Intersects = 5, - Within = 6, - CoveredBy = 7, -} +impl GpuSpatialRuntimeWrapper { + /// # Initializes the GpuSpatialRuntime + /// This function should only be called once per engine instance. + /// # Arguments + /// * `device_id` - The GPU device ID to use. + /// * `ptx_root` - The root directory for PTX files. 
+ pub fn try_new( + device_id: i32, + ptx_root: &str, + cuda_init_memory_pool_ratio: f32, + ) -> Result { + let mut runtime = GpuSpatialRuntime { + init: None, + release: None, + get_last_error: None, + private_data: std::ptr::null_mut(), + }; -impl TryFrom for GpuSpatialPredicateWrapper { - type Error = &'static str; + unsafe { + // Set function pointers to the C functions + GpuSpatialRuntimeCreate(&mut runtime); + } - fn try_from(v: c_uint) -> Result { - match v { - 0 => Ok(GpuSpatialPredicateWrapper::Equals), - 1 => Ok(GpuSpatialPredicateWrapper::Disjoint), - 2 => Ok(GpuSpatialPredicateWrapper::Touches), - 3 => Ok(GpuSpatialPredicateWrapper::Contains), - 4 => Ok(GpuSpatialPredicateWrapper::Covers), - 5 => Ok(GpuSpatialPredicateWrapper::Intersects), - 6 => Ok(GpuSpatialPredicateWrapper::Within), - 7 => Ok(GpuSpatialPredicateWrapper::CoveredBy), - _ => Err("Invalid GpuSpatialPredicate value"), + if let Some(init_fn) = runtime.init { + let c_ptx_root = CString::new(ptx_root).expect("CString::new failed"); + + let mut config = GpuSpatialRuntimeConfig { + device_id, + ptx_root: c_ptx_root.as_ptr(), + cuda_init_memory_pool_ratio, + }; + + // This is an unsafe call because it's calling a C function from the bindings. 
+ unsafe { + if init_fn(&runtime as *const _ as *mut _, &mut config) != 0 { + let error_message = + runtime.get_last_error.unwrap()(&runtime as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + return Err(GpuSpatialError::Init(error_string)); + } + } } - } -} -impl Default for GpuSpatialJoinerWrapper { - fn default() -> Self { - Self::new() + Ok(GpuSpatialRuntimeWrapper { runtime }) } } -impl GpuSpatialJoinerWrapper { - pub fn new() -> Self { - GpuSpatialJoinerWrapper { - joiner: GpuSpatialJoiner { +impl Default for GpuSpatialRuntimeWrapper { + fn default() -> Self { + GpuSpatialRuntimeWrapper { + runtime: GpuSpatialRuntime { init: None, - clear: None, - create_context: None, - destroy_context: None, - push_build: None, - finish_building: None, - push_stream: None, - get_build_indices_buffer: None, - get_stream_indices_buffer: None, release: None, + get_last_error: None, private_data: std::ptr::null_mut(), - last_error: std::ptr::null(), }, } } +} +impl Drop for GpuSpatialRuntimeWrapper { + fn drop(&mut self) { + // Call the release function if it exists + if let Some(release_fn) = self.runtime.release { + unsafe { + release_fn(&mut self.runtime as *mut _); + } + } + } +} + +pub struct GpuSpatialIndexFloat2DWrapper { + index: SedonaFloatIndex2D, + _runtime: Arc>, // Keep a reference to the RT engine to ensure it lives as long as the index +} + +impl GpuSpatialIndexFloat2DWrapper { /// # Initializes the GpuSpatialJoiner /// This function should only be called once per joiner instance. /// /// # Arguments + /// * `runtime` - The GPUSpatial runtime to use for GPU operations. /// * `concurrency` - How many threads will call the joiner concurrently. - /// * `ptx_root` - The root directory for PTX files. 
- pub fn init(&mut self, concurrency: u32, ptx_root: &str) -> Result<(), GpuSpatialError> { - let joiner_ptr: *mut GpuSpatialJoiner = &mut self.joiner; + pub fn try_new( + runtime: &Arc>, + concurrency: u32, + ) -> Result { + let mut index = SedonaFloatIndex2D { + clear: None, + create_context: None, + destroy_context: None, + push_build: None, + finish_building: None, + probe: None, + get_build_indices_buffer: None, + get_probe_indices_buffer: None, + get_last_error: None, + context_get_last_error: None, + release: None, + private_data: std::ptr::null_mut(), + }; + let mut engine_guard = runtime + .lock() + .map_err(|_| GpuSpatialError::Init("Failed to acquire mutex lock".to_string()))?; + let config = GpuSpatialIndexConfig { + runtime: &mut engine_guard.runtime, + concurrency, + }; unsafe { // Set function pointers to the C functions - GpuSpatialJoinerCreate(joiner_ptr); - } - - if let Some(init_fn) = self.joiner.init { - let c_ptx_root = CString::new(ptx_root).expect("CString::new failed"); - - let mut config = GpuSpatialJoinerConfig { - concurrency, - ptx_root: c_ptx_root.as_ptr(), - }; - - // This is an unsafe call because it's calling a C function from the bindings. 
- unsafe { - if init_fn(&self.joiner as *const _ as *mut _, &mut config) != 0 { - let error_message = self.joiner.last_error; - let c_str = std::ffi::CStr::from_ptr(error_message); - let error_string = c_str.to_string_lossy().into_owned(); - return Err(GpuSpatialError::Init(error_string)); - } + if GpuSpatialIndexFloat2DCreate(&mut index, &config) != 0 { + let error_message = index.get_last_error.unwrap()(&runtime as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + return Err(GpuSpatialError::Init(error_string)); } } - Ok(()) + Ok(GpuSpatialIndexFloat2DWrapper { + index, + _runtime: runtime.clone(), + }) } /// # Clears the GpuSpatialJoiner @@ -126,69 +161,43 @@ impl GpuSpatialJoinerWrapper { /// instead of building a new one because creating a new joiner is expensive. /// **This method is not thread-safe and should be called from a single thread.** pub fn clear(&mut self) { - if let Some(clear_fn) = self.joiner.clear { + if let Some(clear_fn) = self.index.clear { unsafe { - clear_fn(&mut self.joiner as *mut _); + clear_fn(&mut self.index as *mut _); } } } - /// # Pushes an array of WKBs to the build side of the joiner + /// # Pushes an array of rectangles to the build side of the joiner /// This function can be called multiple times to push multiple arrays. - /// The joiner will internally parse the WKBs and build a spatial index. + /// The joiner will internally parse the rectangles and build a spatial index. /// After pushing all build data, you must call `finish_building()` to build the /// spatial index. /// **This method is not thread-safe and should be called from a single thread.** /// # Arguments - /// * `array` - The array of WKBs to push. - /// * `offset` - The offset of the array to push. - /// * `length` - The length of the array to push. - pub fn push_build( + /// * `buf` - The array pointer to the rectangles to push. 
+ /// * `n_rects` - The number of rectangles in the array. + /// # Safety + /// This function is unsafe because it takes a raw pointer to the rectangles. + /// + pub unsafe fn push_build( &mut self, - array: &ArrayRef, - offset: i64, - length: i64, + buf: *const f32, + n_rects: u32, ) -> Result<(), GpuSpatialError> { - log::info!( - "DEBUG FFI: push_build called with offset={}, length={}", - offset, - length - ); - log::info!( - "DEBUG FFI: Array length={}, null_count={}", - array.len(), - array.null_count() - ); - - // 1. Convert the single ArrayRef to its FFI representation - let (ffi_array, _) = arrow_array::ffi::to_ffi(&array.to_data())?; - - log::info!("DEBUG FFI: FFI conversion successful"); - log::info!("DEBUG FFI: FFI array null_count={}", ffi_array.null_count()); - - // 2. Get the raw pointer to the FFI_ArrowArray struct - // let arrow_ptr = &mut ffi_array as *mut FFI_ArrowArray as *mut ArrowArray; + log::debug!("DEBUG FFI: push_build called with length={}", n_rects); - if let Some(push_build_fn) = self.joiner.push_build { + if let Some(push_build_fn) = self.index.push_build { unsafe { - let ffi_array_ptr: *const ArrowArray = - transmute(&ffi_array as *const FFI_ArrowArray); - log::info!("DEBUG FFI: Calling C++ push_build function"); - if push_build_fn( - &mut self.joiner as *mut _, - std::ptr::null_mut(), // schema is unused currently - ffi_array_ptr as *mut _, - offset, - length, - ) != 0 - { - let error_message = self.joiner.last_error; + if push_build_fn(&mut self.index as *mut _, buf, n_rects) != 0 { + let error_message = + self.index.get_last_error.unwrap()(&mut self.index as *mut _); let c_str = std::ffi::CStr::from_ptr(error_message); let error_string = c_str.to_string_lossy().into_owned(); log::error!("DEBUG FFI: push_build failed: {}", error_string); return Err(GpuSpatialError::PushBuild(error_string)); } - log::info!("DEBUG FFI: push_build C++ call succeeded"); + log::debug!("DEBUG FFI: push_build C++ call succeeded"); } } Ok(()) @@ -201,10 
+210,11 @@ impl GpuSpatialJoinerWrapper { /// for spatial join operations. /// **This method is not thread-safe and should be called from a single thread.** pub fn finish_building(&mut self) -> Result<(), GpuSpatialError> { - if let Some(finish_building_fn) = self.joiner.finish_building { + if let Some(finish_building_fn) = self.index.finish_building { unsafe { - if finish_building_fn(&mut self.joiner as *mut _) != 0 { - let error_message = self.joiner.last_error; + if finish_building_fn(&mut self.index as *mut _) != 0 { + let error_message = + self.index.get_last_error.unwrap()(&mut self.index as *mut _); let c_str = std::ffi::CStr::from_ptr(error_message); let error_string = c_str.to_string_lossy().into_owned(); return Err(GpuSpatialError::FinishBuild(error_string)); @@ -224,89 +234,73 @@ impl GpuSpatialJoinerWrapper { /// The context can be destroyed by calling the `destroy_context` function pointer in the `GpuSpatialJoiner` struct. /// The context should be destroyed before destroying the joiner. 
/// **This method is thread-safe.** - pub fn create_context(&mut self, ctx: &mut GpuSpatialJoinerContext) { - if let Some(create_context_fn) = self.joiner.create_context { + pub fn create_context(&self, ctx: &mut SedonaSpatialIndexContext) { + if let Some(create_context_fn) = self.index.create_context { unsafe { - create_context_fn(&mut self.joiner as *mut _, ctx as *mut _); + // Cast the shared reference to a raw pointer, then to a mutable raw pointer + create_context_fn(ctx as *mut _); } } } - pub fn destroy_context(&mut self, ctx: &mut GpuSpatialJoinerContext) { - if let Some(destroy_context_fn) = self.joiner.destroy_context { + pub fn destroy_context(&self, ctx: &mut SedonaSpatialIndexContext) { + if let Some(destroy_context_fn) = self.index.destroy_context { unsafe { destroy_context_fn(ctx as *mut _); } } } - pub fn push_stream( - &mut self, - ctx: &mut GpuSpatialJoinerContext, - array: &ArrayRef, - offset: i64, - length: i64, - predicate: GpuSpatialPredicateWrapper, - array_index_offset: i32, + /// # Probes an array of rectangles against the built spatial index + /// This function probes an array of rectangles against the spatial index built + /// using `push_build()` and `finish_building()`. It finds all pairs of rectangles + /// that satisfy the spatial relation defined by the index. + /// The results are stored in the context passed to the function. + /// **This method is thread-safe if each thread uses its own context.** + /// # Arguments + /// * `ctx` - The context for the thread performing the spatial join. + /// * `buf` - A pointer to the array of rectangles to probe. + /// * `n_rects` - The number of rectangles in the array. + /// # Safety + /// This function is unsafe because it takes a raw pointer to the rectangles. 
+ pub unsafe fn probe( + &self, + ctx: &mut SedonaSpatialIndexContext, + buf: *const f32, + n_rects: u32, ) -> Result<(), GpuSpatialError> { - log::info!( - "DEBUG FFI: push_stream called with offset={}, length={}, predicate={:?}", - offset, - length, - predicate - ); - log::info!( - "DEBUG FFI: Array length={}, null_count={}", - array.len(), - array.null_count() - ); - - // 1. Convert the single ArrayRef to its FFI representation - let (ffi_array, _) = arrow_array::ffi::to_ffi(&array.to_data())?; - - log::info!("DEBUG FFI: FFI conversion successful"); - log::info!("DEBUG FFI: FFI array null_count={}", ffi_array.null_count()); - - // 2. Get the raw pointer to the FFI_ArrowArray struct - // let arrow_ptr = &mut ffi_array as *mut FFI_ArrowArray as *mut ArrowArray; + log::debug!("DEBUG FFI: probe called with length={}", n_rects); - if let Some(push_stream_fn) = self.joiner.push_stream { + if let Some(probe_fn) = self.index.probe { unsafe { - let ffi_array_ptr: *const ArrowArray = - transmute(&ffi_array as *const FFI_ArrowArray); - log::info!("DEBUG FFI: Calling C++ push_stream function"); - if push_stream_fn( - &mut self.joiner as *mut _, + if probe_fn( + &self.index as *const _ as *mut _, ctx as *mut _, - std::ptr::null_mut(), // schema is unused currently - ffi_array_ptr as *mut _, - offset, - length, - predicate as c_uint, - array_index_offset, + buf, + n_rects, ) != 0 { - let error_message = ctx.last_error; + let error_message = self.index.context_get_last_error.unwrap()(ctx); let c_str = std::ffi::CStr::from_ptr(error_message); let error_string = c_str.to_string_lossy().into_owned(); - log::error!("DEBUG FFI: push_stream failed: {}", error_string); - return Err(GpuSpatialError::PushStream(error_string)); + log::error!("DEBUG FFI: probe failed: {}", error_string); + return Err(GpuSpatialError::Probe(error_string)); } - log::info!("DEBUG FFI: push_stream C++ call succeeded"); + log::debug!("DEBUG FFI: probe C++ call succeeded"); } } Ok(()) } - pub fn 
get_build_indices_buffer(&self, ctx: &mut GpuSpatialJoinerContext) -> &[u32] { - if let Some(get_build_indices_buffer_fn) = self.joiner.get_build_indices_buffer { - let mut build_indices_ptr: *mut c_void = std::ptr::null_mut(); + pub fn get_build_indices_buffer(&self, ctx: &mut SedonaSpatialIndexContext) -> &[u32] { + if let Some(get_build_indices_buffer_fn) = self.index.get_build_indices_buffer { + let mut build_indices_ptr: *mut u32 = std::ptr::null_mut(); let mut build_indices_len: u32 = 0; unsafe { get_build_indices_buffer_fn( ctx as *mut _, - &mut build_indices_ptr as *mut *mut c_void, + &mut build_indices_ptr as *mut *mut u32, &mut build_indices_len as *mut u32, ); @@ -331,179 +325,387 @@ impl GpuSpatialJoinerWrapper { &[] } - pub fn get_stream_indices_buffer(&self, ctx: &mut GpuSpatialJoinerContext) -> &[u32] { - if let Some(get_stream_indices_buffer_fn) = self.joiner.get_stream_indices_buffer { - let mut stream_indices_ptr: *mut c_void = std::ptr::null_mut(); - let mut stream_indices_len: u32 = 0; + pub fn get_probe_indices_buffer(&self, ctx: &mut SedonaSpatialIndexContext) -> &[u32] { + if let Some(get_probe_indices_buffer_fn) = self.index.get_probe_indices_buffer { + let mut probe_indices_ptr: *mut u32 = std::ptr::null_mut(); + let mut probe_indices_len: u32 = 0; unsafe { - get_stream_indices_buffer_fn( + get_probe_indices_buffer_fn( ctx as *mut _, - &mut stream_indices_ptr as *mut *mut c_void, - &mut stream_indices_len as *mut u32, + &mut probe_indices_ptr as *mut *mut u32, + &mut probe_indices_len as *mut u32, ); // Check length first - empty vectors return empty slice - if stream_indices_len == 0 { + if probe_indices_len == 0 { return &[]; } // Validate pointer (should not be null if length > 0) - if stream_indices_ptr.is_null() { + if probe_indices_ptr.is_null() { return &[]; } // Convert the raw pointer to a slice. This is safe to do because // we've validated the pointer is non-null and length is valid. 
- let typed_ptr = stream_indices_ptr as *const u32; + let typed_ptr = probe_indices_ptr as *const u32; // Safety: We've checked ptr is non-null and len > 0 - return std::slice::from_raw_parts(typed_ptr, stream_indices_len as usize); + return std::slice::from_raw_parts(typed_ptr, probe_indices_len as usize); } } &[] } +} - pub fn release(&mut self) { - // Call the release function if it exists - if let Some(release_fn) = self.joiner.release { - unsafe { - release_fn(&mut self.joiner as *mut _); - } +impl Default for GpuSpatialIndexFloat2DWrapper { + fn default() -> Self { + GpuSpatialIndexFloat2DWrapper { + index: SedonaFloatIndex2D { + clear: None, + create_context: None, + destroy_context: None, + push_build: None, + finish_building: None, + probe: None, + get_build_indices_buffer: None, + get_probe_indices_buffer: None, + get_last_error: None, + context_get_last_error: None, + release: None, + private_data: std::ptr::null_mut(), + }, + _runtime: Arc::new(Mutex::new(GpuSpatialRuntimeWrapper::default())), } } } -impl Drop for GpuSpatialJoinerWrapper { +impl Drop for GpuSpatialIndexFloat2DWrapper { fn drop(&mut self) { // Call the release function if it exists - if let Some(release_fn) = self.joiner.release { + if let Some(release_fn) = self.index.release { unsafe { - release_fn(&mut self.joiner as *mut _); + release_fn(&mut self.index as *mut _); } } } } -#[cfg(test)] -mod test { - use super::*; - use sedona_expr::scalar_udf::SedonaScalarUDF; - use sedona_geos::register::scalar_kernels; - use sedona_schema::crs::lnglat; - use sedona_schema::datatypes::{Edges, SedonaType, WKB_GEOMETRY}; - use sedona_testing::create::create_array_storage; - use sedona_testing::testers::ScalarUdfTester; - use std::env; - use std::path::PathBuf; - - #[test] - fn test_gpu_joiner_end2end() { - let mut joiner = GpuSpatialJoinerWrapper::new(); - - let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); - let ptx_root = out_path.join("share/gpuspatial/shaders"); - - joiner - .init( - 1, 
- ptx_root.to_str().expect("Failed to convert path to string"), - ) - .expect("Failed to init GpuSpatialJoiner"); - - let polygon_values = &[ - Some("POLYGON ((30 10, 40 40, 20 40, 10 20, 30 10))"), - Some("POLYGON ((35 10, 45 45, 15 40, 10 20, 35 10), (20 30, 35 35, 30 20, 20 30))"), - Some("POLYGON ((0 0, 10 0, 10 10, 0 10, 0 0), (2 2, 3 2, 3 3, 2 3, 2 2), (6 6, 8 6, 8 8, 6 8, 6 6))"), - Some("POLYGON ((30 0, 60 20, 50 50, 10 50, 0 20, 30 0), (20 30, 25 40, 15 40, 20 30), (30 30, 35 40, 25 40, 30 30), (40 30, 45 40, 35 40, 40 30))"), - Some("POLYGON ((40 0, 50 30, 80 20, 90 70, 60 90, 30 80, 20 40, 40 0), (50 20, 65 30, 60 50, 45 40, 50 20), (30 60, 50 70, 45 80, 30 60))"), - ]; - let polygons = create_array_storage(polygon_values, &WKB_GEOMETRY); - - // Let the gpusaptial joiner to parse WKBs and get building boxes - joiner - .push_build(&polygons, 0, polygons.len().try_into().unwrap()) - .expect("Failed to push building"); - // Build a spatial index for Build internally on GPU - joiner.finish_building().expect("Failed to finish building"); - - // Each thread that performs spatial joins should have its own context. - // The context is passed to PushStream calls to perform spatial joins. 
- let mut ctx = GpuSpatialJoinerContext { - last_error: std::ptr::null(), +#[repr(u32)] +#[derive(Debug, PartialEq, Copy, Clone)] +pub enum GpuSpatialRelationPredicateWrapper { + Equals = 0, + Disjoint = 1, + Touches = 2, + Contains = 3, + Covers = 4, + Intersects = 5, + Within = 6, + CoveredBy = 7, +} + +impl TryFrom for GpuSpatialRelationPredicateWrapper { + type Error = &'static str; + + fn try_from(v: c_uint) -> Result { + match v { + 0 => Ok(GpuSpatialRelationPredicateWrapper::Equals), + 1 => Ok(GpuSpatialRelationPredicateWrapper::Disjoint), + 2 => Ok(GpuSpatialRelationPredicateWrapper::Touches), + 3 => Ok(GpuSpatialRelationPredicateWrapper::Contains), + 4 => Ok(GpuSpatialRelationPredicateWrapper::Covers), + 5 => Ok(GpuSpatialRelationPredicateWrapper::Intersects), + 6 => Ok(GpuSpatialRelationPredicateWrapper::Within), + 7 => Ok(GpuSpatialRelationPredicateWrapper::CoveredBy), + _ => Err("Invalid GpuSpatialPredicate value"), + } + } +} + +pub struct GpuSpatialRefinerWrapper { + refiner: SedonaSpatialRefiner, + _runtime: Arc>, // Keep a reference to the RT engine to ensure it lives as long as the refiner +} + +impl GpuSpatialRefinerWrapper { + /// # Initializes the GpuSpatialJoiner + /// This function should only be called once per joiner instance. + /// + /// # Arguments + /// * `concurrency` - How many threads will call the joiner concurrently. + /// * `ptx_root` - The root directory for PTX files. 
+ pub fn try_new( + runtime: &Arc>, + concurrency: u32, + compress_bvh: bool, + pipeline_batches: u32, + ) -> Result { + let mut refiner = SedonaSpatialRefiner { + clear: None, + push_build: None, + finish_building: None, + refine_loaded: None, + refine: None, + get_last_error: None, + release: None, private_data: std::ptr::null_mut(), - build_indices: std::ptr::null_mut(), - stream_indices: std::ptr::null_mut(), }; + let mut engine_guard = runtime + .lock() + .map_err(|_| GpuSpatialError::Init("Failed to acquire mutex lock".to_string()))?; + let config = GpuSpatialRefinerConfig { + runtime: &mut engine_guard.runtime, + concurrency, + compress_bvh, + pipeline_batches, + }; + unsafe { + // Set function pointers to the C functions + if GpuSpatialRefinerCreate(&mut refiner, &config) != 0 { + let error_message = refiner.get_last_error.unwrap()(&refiner as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + return Err(GpuSpatialError::Init(error_string)); + } + } + Ok(GpuSpatialRefinerWrapper { + refiner, + _runtime: runtime.clone(), + }) + } + + pub fn clear(&self) { + log::debug!("DEBUG FFI: clear called"); + if let Some(clear_fn) = self.refiner.clear { + unsafe { + clear_fn(&self.refiner as *const _ as *mut _); + } + log::debug!("DEBUG FFI: clear completed"); + } + } - joiner.create_context(&mut ctx); - - let point_values = &[ - Some("POINT (30 20)"), // poly0 - Some("POINT (20 20)"), // poly1 - Some("POINT (1 1)"), // poly2 - Some("POINT (70 70)"), - Some("POINT (55 35)"), // poly4 - ]; - let points = create_array_storage(point_values, &WKB_GEOMETRY); - - // array_index_offset offsets the result of stream indices - let array_index_offset = 0; - joiner - .push_stream( - &mut ctx, - &points, - 0, - points.len().try_into().unwrap(), - GpuSpatialPredicateWrapper::Intersects, - array_index_offset, - ) - .expect("Failed to push building"); - - let build_indices = 
joiner.get_build_indices_buffer(&mut ctx); - let stream_indices = joiner.get_stream_indices_buffer(&mut ctx); - - let mut result_pairs: Vec<(u32, u32)> = Vec::new(); - - for (build_index, stream_index) in build_indices.iter().zip(stream_indices.iter()) { - result_pairs.push((*build_index, *stream_index)); + /// # Loads a build array into the GPU spatial refiner + /// This function loads an array of geometries into the GPU spatial refiner + /// for parsing and loading on the GPU side. + /// # Arguments + /// * `array` - The array of geometries to load. + /// # Returns + /// * `Result<(), GpuSpatialError>` - Ok if successful, Err if an error occurred. + pub fn push_build(&self, array: &ArrayRef) -> Result<(), GpuSpatialError> { + log::debug!("DEBUG FFI: push_build called with array={}", array.len(),); + + let (ffi_array, ffi_schema) = arrow_array::ffi::to_ffi(&array.to_data())?; + log::debug!("DEBUG FFI: FFI conversion successful"); + if let Some(load_fn) = self.refiner.push_build { + unsafe { + let ffi_array_ptr: *const ArrowArray = + transmute(&ffi_array as *const FFI_ArrowArray); + let ffi_schema_ptr: *const ArrowSchema = + transmute(&ffi_schema as *const FFI_ArrowSchema); + log::debug!("DEBUG FFI: Calling C++ refine function"); + let _new_len: u32 = 0; + if load_fn( + &self.refiner as *const _ as *mut _, + ffi_schema_ptr as *mut _, + ffi_array_ptr as *mut _, + ) != 0 + { + let error_message = + self.refiner.get_last_error.unwrap()(&self.refiner as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + log::error!("DEBUG FFI: push_build failed: {}", error_string); + return Err(GpuSpatialError::PushBuild(error_string)); + } + log::debug!("DEBUG FFI: push_build C++ call succeeded"); + } + } + Ok(()) + } + + pub fn finish_building(&self) -> Result<(), GpuSpatialError> { + log::debug!("DEBUG FFI: finish_building called"); + + if let Some(finish_building_fn) = self.refiner.finish_building 
{ + unsafe { + if finish_building_fn(&self.refiner as *const _ as *mut _) != 0 { + let error_message = + self.refiner.get_last_error.unwrap()(&self.refiner as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + log::error!("DEBUG FFI: finish_building failed: {}", error_string); + return Err(GpuSpatialError::FinishBuild(error_string)); + } + log::debug!("DEBUG FFI: finish_building C++ call succeeded"); + } } + Ok(()) + } - let kernels = scalar_kernels(); - - // Iterate through the vector and find the one named "st_intersects" - let st_intersects = kernels - .into_iter() - .find(|(name, _)| *name == "st_intersects") - .map(|(_, kernel_ref)| kernel_ref) - .unwrap(); - - let sedona_type = SedonaType::Wkb(Edges::Planar, lnglat()); - let udf = SedonaScalarUDF::from_kernel("st_intersects", st_intersects); - let tester = - ScalarUdfTester::new(udf.into(), vec![sedona_type.clone(), sedona_type.clone()]); - - let mut answer_pairs: Vec<(u32, u32)> = Vec::new(); - - for (poly_index, poly) in polygon_values.iter().enumerate() { - for (point_index, point) in point_values.iter().enumerate() { - let result = tester - .invoke_scalar_scalar(poly.unwrap(), point.unwrap()) - .unwrap(); - if result == true.into() { - answer_pairs.push((poly_index as u32, point_index as u32)); + /// # Refines candidate pairs using the GPU spatial refiner + /// This function refines candidate pairs of geometries using the GPU spatial refiner. + /// It takes the probe side array of geometries and a predicate, and outputs the refined pairs of + /// indices that satisfy the predicate. + /// # Arguments + /// * `array` - The array of geometries on the probe side. + /// * `predicate` - The spatial relation predicate to use for refinement. + /// * `build_indices` - The input/output vector of indices for the first array. + /// * `probe_indices` - The input/output vector of indices for the second array. 
+ /// # Returns + /// * `Result<(), GpuSpatialError>` - Ok if successful, Err if an error occurred. + pub fn refine_loaded( + &self, + array: &ArrayRef, + predicate: GpuSpatialRelationPredicateWrapper, + build_indices: &mut Vec, + probe_indices: &mut Vec, + ) -> Result<(), GpuSpatialError> { + log::debug!( + "DEBUG FFI: refine called with array={}, indices={}, predicate={:?}", + array.len(), + build_indices.len(), + predicate + ); + + let (ffi_array, ffi_schema) = arrow_array::ffi::to_ffi(&array.to_data())?; + + log::debug!("DEBUG FFI: FFI conversion successful"); + + if let Some(refine_fn) = self.refiner.refine_loaded { + unsafe { + let ffi_array_ptr: *const ArrowArray = + transmute(&ffi_array as *const FFI_ArrowArray); + let ffi_schema_ptr: *const ArrowSchema = + transmute(&ffi_schema as *const FFI_ArrowSchema); + log::debug!("DEBUG FFI: Calling C++ refine function"); + let mut new_len: u32 = 0; + if refine_fn( + &self.refiner as *const _ as *mut _, + ffi_schema_ptr as *mut _, + ffi_array_ptr as *mut _, + predicate as c_uint, + build_indices.as_mut_ptr(), + probe_indices.as_mut_ptr(), + build_indices.len() as u32, + &mut new_len as *mut u32, + ) != 0 + { + let error_message = + self.refiner.get_last_error.unwrap()(&self.refiner as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + log::error!("DEBUG FFI: refine failed: {}", error_string); + return Err(GpuSpatialError::Refine(error_string)); } + log::debug!("DEBUG FFI: refine C++ call succeeded"); + // Update the lengths of the output index vectors + build_indices.truncate(new_len as usize); + probe_indices.truncate(new_len as usize); } } + Ok(()) + } + /// # Refines candidate pairs using the GPU spatial refiner + /// This function refines candidate pairs of geometries using the GPU spatial refiner. + /// It takes two arrays of geometries and a predicate, and outputs the refined pairs of + /// indices that satisfy the predicate. 
+ /// # Arguments + /// * `array1` - The first array of geometries. + /// * `array2` - The second array of geometries. + /// * `predicate` - The spatial relation predicate to use for refinement. + /// * `indices1` - The input/output vector of indices for the first array. + /// * `indices2` - The input/output vector of indices for the second array. + /// # Returns + /// * `Result<(), GpuSpatialError>` - Ok if successful, Err if an error occurred. + pub fn refine( + &self, + array1: &ArrayRef, + array2: &ArrayRef, + predicate: GpuSpatialRelationPredicateWrapper, + indices1: &mut Vec, + indices2: &mut Vec, + ) -> Result<(), GpuSpatialError> { + log::debug!( + "DEBUG FFI: refine called with array1={}, array2={}, indices={}, predicate={:?}", + array1.len(), + array2.len(), + indices1.len(), + predicate + ); - // Sort both vectors. The default sort on tuples compares element by element. - result_pairs.sort(); - answer_pairs.sort(); + let (ffi_array1, ffi_schema1) = arrow_array::ffi::to_ffi(&array1.to_data())?; + let (ffi_array2, ffi_schema2) = arrow_array::ffi::to_ffi(&array2.to_data())?; - // Assert that the two sorted vectors are equal. 
- assert_eq!(result_pairs, answer_pairs); + log::debug!("DEBUG FFI: FFI conversion successful"); - joiner.destroy_context(&mut ctx); - joiner.release(); + if let Some(refine_fn) = self.refiner.refine { + unsafe { + let ffi_array1_ptr: *const ArrowArray = + transmute(&ffi_array1 as *const FFI_ArrowArray); + let ffi_schema1_ptr: *const ArrowSchema = + transmute(&ffi_schema1 as *const FFI_ArrowSchema); + let ffi_array2_ptr: *const ArrowArray = + transmute(&ffi_array2 as *const FFI_ArrowArray); + let ffi_schema2_ptr: *const ArrowSchema = + transmute(&ffi_schema2 as *const FFI_ArrowSchema); + log::debug!("DEBUG FFI: Calling C++ refine function"); + let mut new_len: u32 = 0; + if refine_fn( + &self.refiner as *const _ as *mut _, + ffi_schema1_ptr as *mut _, + ffi_array1_ptr as *mut _, + ffi_schema2_ptr as *mut _, + ffi_array2_ptr as *mut _, + predicate as c_uint, + indices1.as_mut_ptr(), + indices2.as_mut_ptr(), + indices1.len() as u32, + &mut new_len as *mut u32, + ) != 0 + { + let error_message = + self.refiner.get_last_error.unwrap()(&self.refiner as *const _ as *mut _); + let c_str = std::ffi::CStr::from_ptr(error_message); + let error_string = c_str.to_string_lossy().into_owned(); + log::error!("DEBUG FFI: refine failed: {}", error_string); + return Err(GpuSpatialError::Refine(error_string)); + } + log::debug!("DEBUG FFI: refine C++ call succeeded"); + // Update the lengths of the output index vectors + indices1.truncate(new_len as usize); + indices2.truncate(new_len as usize); + } + } + Ok(()) + } +} + +impl Default for GpuSpatialRefinerWrapper { + fn default() -> Self { + GpuSpatialRefinerWrapper { + refiner: SedonaSpatialRefiner { + clear: None, + push_build: None, + finish_building: None, + refine_loaded: None, + refine: None, + get_last_error: None, + release: None, + private_data: std::ptr::null_mut(), + }, + _runtime: Arc::new(Mutex::new(GpuSpatialRuntimeWrapper::default())), + } + } +} + +impl Drop for GpuSpatialRefinerWrapper { + fn drop(&mut self) { + // 
Call the release function if it exists
+        if let Some(release_fn) = self.refiner.release {
+            unsafe {
+                release_fn(&mut self.refiner as *mut _);
+            }
+        }
+    }
+}
diff --git a/python/sedonadb/Cargo.toml b/python/sedonadb/Cargo.toml
index 426bed90e..e92d76934 100644
--- a/python/sedonadb/Cargo.toml
+++ b/python/sedonadb/Cargo.toml
@@ -29,6 +29,7 @@ crate-type = ["cdylib"]
 default = ["mimalloc"]
 mimalloc = ["dep:mimalloc", "dep:libmimalloc-sys"]
 s2geography = ["sedona/s2geography"]
+gpu = ["sedona/gpu"]
 
 [dependencies]
 adbc_core = { workspace = true }
@@ -54,3 +55,4 @@ thiserror = { workspace = true }
 tokio = { workspace = true }
 mimalloc = { workspace = true, optional = true }
 libmimalloc-sys = { workspace = true, optional = true }
+env_logger = { workspace = true }
diff --git a/python/sedonadb/src/lib.rs b/python/sedonadb/src/lib.rs
index 6a316964e..387a5eaef 100644
--- a/python/sedonadb/src/lib.rs
+++ b/python/sedonadb/src/lib.rs
@@ -98,6 +98,7 @@ fn configure_proj_shared(
 #[pymodule]
 fn _lib(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
+    env_logger::init();
     #[cfg(feature = "mimalloc")]
     configure_tg_allocator();
 
     m.add_function(wrap_pyfunction!(configure_proj_shared, m)?)?;
diff --git a/rust/sedona-common/src/option.rs b/rust/sedona-common/src/option.rs
index bc74acf74..440152f8a 100644
--- a/rust/sedona-common/src/option.rs
+++ b/rust/sedona-common/src/option.rs
@@ -77,6 +77,35 @@ config_namespace! {
     /// of spawning parallel tasks. Higher values reduce parallelization overhead
     /// for small datasets, while lower values enable more fine-grained parallelism.
     pub parallel_refinement_chunk_size: usize, default = 8192
+
+    /// GPU acceleration options
+    pub gpu: GpuOptions, default = GpuOptions::default()
     }
 }
+
+config_namespace! {
+    /// Configuration options for GPU-accelerated spatial joins
+    pub struct GpuOptions {
+        /// Enable GPU-accelerated spatial joins (requires CUDA and GPU feature flag)
+        pub enable: bool, default = false
+
+        /// Concatenate all geometries on the build-side into a single buffer for GPU processing
+        pub concat_build: bool, default = true
+
+        /// GPU device ID to use (0 = first GPU, 1 = second, etc.)
+        pub device_id: usize, default = 0
+
+        /// Fall back to CPU if GPU initialization or execution fails
+        pub fallback_to_cpu: bool, default = true
+
+        /// Overlapping parsing and refinement by pipelining multiple batches; 1 means no pipelining
+        pub pipeline_batches: usize, default = 1
+
+        /// Ratio of total GPU memory to initialize CUDA memory pool (between 0% and 100%)
+        pub init_memory_pool_percentage: usize, default = 50
+
+        /// Compress BVH to reduce memory usage for processing larger datasets at the cost of some performance
+        pub compress_bvh: bool, default = false
+    }
+}
diff --git a/rust/sedona-spatial-join/Cargo.toml b/rust/sedona-spatial-join/Cargo.toml
index 9831c59b1..b771b8aae 100644
--- a/rust/sedona-spatial-join/Cargo.toml
+++ b/rust/sedona-spatial-join/Cargo.toml
@@ -31,12 +31,16 @@ rust-version.workspace = true
 result_large_err = "allow"
 
 [features]
+default = []
 backtrace = ["datafusion-common/backtrace"]
+# Enable GPU acceleration (requires CUDA toolkit and sedona-libgpuspatial with gpu feature)
+gpu = ["sedona-libgpuspatial/gpu"]
 
 [dependencies]
 arrow = { workspace = true }
 arrow-schema = { workspace = true }
 arrow-array = { workspace = true }
+async-trait = { workspace = true }
 datafusion = { workspace = true }
 datafusion-common = { workspace = true }
 datafusion-expr = { workspace = true }
@@ -66,12 +70,14 @@ geo-index = { workspace = true }
 geos = { workspace = true }
 float_next_after = { workspace = true }
 fastrand = { workspace = true }
+log = "0.4"
+sedona-libgpuspatial = { workspace = true }
 
 [dev-dependencies]
 criterion = { workspace = true }
datafusion = { workspace = true, features = ["sql"] } rstest = { workspace = true } -sedona-testing = { workspace = true} +sedona-testing = { workspace = true } wkt = { workspace = true } tokio = { workspace = true, features = ["macros"] } rand = { workspace = true } diff --git a/rust/sedona-spatial-join/src/build_index.rs b/rust/sedona-spatial-join/src/build_index.rs index f369365c5..03e1b3da1 100644 --- a/rust/sedona-spatial-join/src/build_index.rs +++ b/rust/sedona-spatial-join/src/build_index.rs @@ -24,11 +24,10 @@ use datafusion_expr::JoinType; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use sedona_common::SedonaOptions; +use crate::index::gpu_spatial_index_builder::GPUSpatialIndexBuilder; +use crate::index::spatial_index::{SpatialIndexRef, SpatialJoinBuildMetrics}; use crate::{ - index::{ - BuildSideBatchesCollector, CollectBuildSideMetrics, SpatialIndex, SpatialIndexBuilder, - SpatialJoinBuildMetrics, - }, + index::{BuildSideBatchesCollector, CPUSpatialIndexBuilder, CollectBuildSideMetrics}, operand_evaluator::create_operand_evaluator, spatial_predicate::SpatialPredicate, }; @@ -39,7 +38,7 @@ use crate::{ /// to determine whether to collect build side partitions concurrently (using spawned tasks) /// or sequentially (for JNI/embedded contexts without async runtime support). 
#[allow(clippy::too_many_arguments)] -pub async fn build_index( +pub(crate) async fn build_index( context: Arc, build_schema: SchemaRef, build_streams: Vec, @@ -47,7 +46,8 @@ pub async fn build_index( join_type: JoinType, probe_threads_count: usize, metrics: ExecutionPlanMetricsSet, -) -> Result { + use_gpu: bool, +) -> Result { let session_config = context.session_config(); let sedona_options = session_config .options() @@ -79,17 +79,33 @@ pub async fn build_index( .iter() .any(|partition| partition.build_side_batch_stream.is_external()); if !contains_external_stream { - let mut index_builder = SpatialIndexBuilder::new( - build_schema, - spatial_predicate, - sedona_options.spatial_join, - join_type, - probe_threads_count, - Arc::clone(memory_pool), - SpatialJoinBuildMetrics::new(0, &metrics), - )?; - index_builder.add_partitions(build_partitions).await?; - index_builder.finish() + if use_gpu { + log::info!("Start building GPU spatial index for build side."); + let mut index_builder = GPUSpatialIndexBuilder::new( + build_schema, + spatial_predicate, + sedona_options.spatial_join, + join_type, + probe_threads_count, + Arc::clone(memory_pool), + SpatialJoinBuildMetrics::new(0, &metrics), + ); + index_builder.add_partitions(build_partitions).await?; + index_builder.finish() + } else { + log::info!("Start building CPU spatial index for build side."); + let mut index_builder = CPUSpatialIndexBuilder::new( + build_schema, + spatial_predicate, + sedona_options.spatial_join, + join_type, + probe_threads_count, + Arc::clone(memory_pool), + SpatialJoinBuildMetrics::new(0, &metrics), + )?; + index_builder.add_partitions(build_partitions).await?; + index_builder.finish() + } } else { Err(DataFusionError::ResourcesExhausted("Memory limit exceeded while collecting indexed data. 
External spatial index builder is not yet implemented.".to_string())) } diff --git a/rust/sedona-spatial-join/src/exec.rs b/rust/sedona-spatial-join/src/exec.rs index 43b73290c..626a027b0 100644 --- a/rust/sedona-spatial-join/src/exec.rs +++ b/rust/sedona-spatial-join/src/exec.rs @@ -34,9 +34,9 @@ use datafusion_physical_plan::{ }; use parking_lot::Mutex; +use crate::index::spatial_index::SpatialIndexRef; use crate::{ build_index::build_index, - index::SpatialIndex, spatial_predicate::{KNNPredicate, SpatialPredicate}, stream::{SpatialJoinProbeMetrics, SpatialJoinStream}, utils::join_utils::{asymmetric_join_output_partitioning, boundedness_from_children}, @@ -133,10 +133,13 @@ pub struct SpatialJoinExec { cache: PlanProperties, /// Spatial index built asynchronously on first execute() call and shared across all partitions. /// Uses OnceAsync for lazy initialization coordinated via async runtime. - once_async_spatial_index: Arc>>>, + once_async_spatial_index: Arc>>>, /// Indicates if this SpatialJoin was converted from a HashJoin /// When true, we preserve HashJoin's equivalence properties and partitioning converted_from_hash_join: bool, + /// Whether to use GPU acceleration for this physical execution plan + /// The value of this field is determined in the optimizer + use_gpu: bool, } impl SpatialJoinExec { @@ -148,11 +151,15 @@ impl SpatialJoinExec { filter: Option, join_type: &JoinType, projection: Option>, + use_gpu: bool, ) -> Result { - Self::try_new_with_options(left, right, on, filter, join_type, projection, false) + Self::try_new_with_options( + left, right, on, filter, join_type, projection, false, use_gpu, + ) } /// Create a new SpatialJoinExec with additional options + #[allow(clippy::too_many_arguments)] pub fn try_new_with_options( left: Arc, right: Arc, @@ -161,6 +168,7 @@ impl SpatialJoinExec { join_type: &JoinType, projection: Option>, converted_from_hash_join: bool, + use_gpu: bool, ) -> Result { let left_schema = left.schema(); let right_schema = 
right.schema(); @@ -192,6 +200,7 @@ impl SpatialJoinExec { cache, once_async_spatial_index: Arc::new(Mutex::new(None)), converted_from_hash_join, + use_gpu, }) } @@ -419,6 +428,7 @@ impl ExecutionPlan for SpatialJoinExec { cache: self.cache.clone(), once_async_spatial_index: Arc::new(Mutex::new(None)), converted_from_hash_join: self.converted_from_hash_join, + use_gpu: self.use_gpu, })) } @@ -464,6 +474,7 @@ impl ExecutionPlan for SpatialJoinExec { let probe_thread_count = self.right.output_partitioning().partition_count(); + Ok(build_index( Arc::clone(&context), build_side.schema(), @@ -472,6 +483,7 @@ impl ExecutionPlan for SpatialJoinExec { self.join_type, probe_thread_count, self.metrics.clone(), + self.use_gpu, )) })? }; @@ -563,6 +575,7 @@ impl SpatialJoinExec { self.join_type, probe_thread_count, self.metrics.clone(), + false, // GPU not supported for KNN joins yet )) })? }; @@ -625,14 +638,13 @@ mod tests { use sedona_testing::datagen::RandomPartitionedDataBuilder; use tokio::sync::OnceCell; + use super::*; use crate::register_spatial_join_optimizer; use sedona_common::{ option::{add_sedona_option_extension, ExecutionMode, SpatialJoinOptions}, - SpatialLibrary, + GpuOptions, SpatialLibrary, }; - use super::*; - type TestPartitions = (SchemaRef, Vec>); /// Creates standard test data with left (Polygon) and right (Point) partitions @@ -757,8 +769,7 @@ mod tests { Ok(ctx) } - #[tokio::test] - async fn test_empty_data() -> Result<()> { + async fn test_empty_data(use_gpu: bool) -> Result<()> { let schema = Arc::new(Schema::new(vec![ Field::new("id", DataType::Int32, false), Field::new("dist", DataType::Float64, false), @@ -769,6 +780,10 @@ mod tests { let options = SpatialJoinOptions { execution_mode: ExecutionMode::PrepareNone, + gpu: GpuOptions { + enable: use_gpu, + ..GpuOptions::default() + }, ..Default::default() }; let ctx = setup_context(Some(options.clone()), 10)?; @@ -801,6 +816,17 @@ mod tests { Ok(()) } + #[tokio::test] + async fn 
test_empty_data_cpu() -> Result<()> { + test_empty_data(false).await + } + + #[cfg(feature = "gpu")] + #[tokio::test] + async fn test_empty_data_gpu() -> Result<()> { + test_empty_data(true).await + } + // Shared test data and expected results - computed only once across all parameterized test cases // Using tokio::sync::OnceCell for async lazy initialization to avoid recomputing expensive // test data generation and nested loop join results for each test parameter combination @@ -916,6 +942,40 @@ mod tests { Ok(()) } + #[rstest] + #[tokio::test] + #[cfg(feature = "gpu")] + async fn test_range_join_gpu(#[values(10, 30, 1000)] max_batch_size: usize) -> Result<()> { + let test_data = get_default_test_data().await; + let expected_results = get_expected_range_join_results().await; + let ((left_schema, left_partitions), (right_schema, right_partitions)) = test_data; + + let options = SpatialJoinOptions { + spatial_library: SpatialLibrary::Tg, // Doesn't matter + execution_mode: ExecutionMode::PrepareNone, // Doesn't matter + gpu: GpuOptions { + enable: true, + ..GpuOptions::default() + }, + ..Default::default() + }; + for (idx, sql) in RANGE_JOIN_SQLS.iter().enumerate() { + let actual_result = run_spatial_join_query( + left_schema, + right_schema, + left_partitions.clone(), + right_partitions.clone(), + Some(options.clone()), + max_batch_size, + sql, + ) + .await?; + assert_eq!(&actual_result, &expected_results[idx]); + } + + Ok(()) + } + #[rstest] #[tokio::test] async fn test_distance_join_with_conf( @@ -948,49 +1008,107 @@ mod tests { Ok(()) } - #[tokio::test] - async fn test_spatial_join_with_filter() -> Result<()> { + async fn test_spatial_join_with_filter(use_gpu: bool) -> Result<()> { let ((left_schema, left_partitions), (right_schema, right_partitions)) = create_test_data_with_size_range((0.1, 10.0), WKB_GEOMETRY)?; for max_batch_size in [10, 30, 100] { let options = SpatialJoinOptions { execution_mode: ExecutionMode::PrepareNone, + gpu: GpuOptions { + enable: 
use_gpu, + ..GpuOptions::default() + }, ..Default::default() }; - test_spatial_join_query(&left_schema, &right_schema, left_partitions.clone(), right_partitions.clone(), &options, max_batch_size, - "SELECT * FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) AND L.dist < R.dist ORDER BY L.id, R.id").await?; - test_spatial_join_query(&left_schema, &right_schema, left_partitions.clone(), right_partitions.clone(), &options, max_batch_size, - "SELECT L.id l_id, R.id r_id FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) AND L.dist < R.dist ORDER BY l_id, r_id").await?; - test_spatial_join_query(&left_schema, &right_schema, left_partitions.clone(), right_partitions.clone(), &options, max_batch_size, - "SELECT L.id l_id, R.id r_id, L.dist l_dist, R.dist r_dist FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) AND L.dist < R.dist ORDER BY l_id, r_id").await?; + + // Use clones of partitions because they are consumed by the test helper + test_spatial_join_query( + &left_schema, + &right_schema, + left_partitions.clone(), + right_partitions.clone(), + &options, + max_batch_size, + "SELECT * FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) AND L.dist < R.dist ORDER BY L.id, R.id" + ).await?; + + test_spatial_join_query( + &left_schema, + &right_schema, + left_partitions.clone(), + right_partitions.clone(), + &options, + max_batch_size, + "SELECT L.id l_id, R.id r_id FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) AND L.dist < R.dist ORDER BY l_id, r_id" + ).await?; + + test_spatial_join_query( + &left_schema, + &right_schema, + left_partitions.clone(), + right_partitions.clone(), + &options, + max_batch_size, + "SELECT L.id l_id, R.id r_id, L.dist l_dist, R.dist r_dist FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) AND L.dist < R.dist ORDER BY l_id, r_id" + ).await?; } Ok(()) } #[tokio::test] - async fn test_range_join_with_empty_partitions() -> Result<()> { + async fn test_spatial_join_with_filter_cpu() -> Result<()> { + 
test_spatial_join_with_filter(false).await + } + + #[cfg(feature = "gpu")] + #[tokio::test] + async fn test_spatial_join_with_filter_gpu() -> Result<()> { + test_spatial_join_with_filter(true).await + } + + async fn test_range_join_with_empty_partitions(use_gpu: bool) -> Result<()> { let ((left_schema, left_partitions), (right_schema, right_partitions)) = create_test_data_with_empty_partitions()?; - + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareNone, + gpu: GpuOptions { + enable: use_gpu, + ..GpuOptions::default() + }, + ..Default::default() + }; for max_batch_size in [10, 30, 1000] { - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareNone, - ..Default::default() - }; test_spatial_join_query(&left_schema, &right_schema, left_partitions.clone(), right_partitions.clone(), &options, max_batch_size, - "SELECT L.id l_id, R.id r_id FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) ORDER BY l_id, r_id").await?; + "SELECT L.id l_id, R.id r_id FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) ORDER BY l_id, r_id").await?; test_spatial_join_query(&left_schema, &right_schema, left_partitions.clone(), right_partitions.clone(), &options, max_batch_size, - "SELECT * FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) ORDER BY L.id, R.id").await?; + "SELECT * FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) ORDER BY L.id, R.id").await?; } - Ok(()) } + #[tokio::test] + async fn test_range_join_with_empty_partitions_cpu() -> Result<()> { + test_range_join_with_empty_partitions(false).await + } + + #[cfg(feature = "gpu")] + #[tokio::test] + async fn test_range_join_with_empty_partitions_gpu() -> Result<()> { + test_range_join_with_empty_partitions(true).await + } + #[tokio::test] async fn test_inner_join() -> Result<()> { - test_with_join_types(JoinType::Inner).await?; + test_with_join_types(JoinType::Inner, false).await?; + Ok(()) + } + + #[cfg(feature = "gpu")] + #[tokio::test] + async fn 
test_inner_join_gpu() -> Result<()> { + test_with_join_types(JoinType::Inner, true).await?; Ok(()) } @@ -999,7 +1117,17 @@ mod tests { async fn test_left_joins( #[values(JoinType::Left, JoinType::LeftSemi, JoinType::LeftAnti)] join_type: JoinType, ) -> Result<()> { - test_with_join_types(join_type).await?; + test_with_join_types(join_type, false).await?; + Ok(()) + } + + #[cfg(feature = "gpu")] + #[rstest] + #[tokio::test] + async fn test_left_joins_gpu( + #[values(JoinType::Left, JoinType::LeftSemi, JoinType::LeftAnti)] join_type: JoinType, + ) -> Result<()> { + test_with_join_types(join_type, true).await?; Ok(()) } @@ -1008,13 +1136,30 @@ mod tests { async fn test_right_joins( #[values(JoinType::Right, JoinType::RightSemi, JoinType::RightAnti)] join_type: JoinType, ) -> Result<()> { - test_with_join_types(join_type).await?; + test_with_join_types(join_type, false).await?; + Ok(()) + } + + #[cfg(feature = "gpu")] + #[rstest] + #[tokio::test] + async fn test_right_joins_gpu( + #[values(JoinType::Right, JoinType::RightSemi, JoinType::RightAnti)] join_type: JoinType, + ) -> Result<()> { + test_with_join_types(join_type, true).await?; Ok(()) } #[tokio::test] async fn test_full_outer_join() -> Result<()> { - test_with_join_types(JoinType::Full).await?; + test_with_join_types(JoinType::Full, false).await?; + Ok(()) + } + + #[cfg(feature = "gpu")] + #[tokio::test] + async fn test_full_outer_join_gpu() -> Result<()> { + test_with_join_types(JoinType::Full, true).await?; Ok(()) } @@ -1024,12 +1169,24 @@ mod tests { #[values(JoinType::LeftMark, JoinType::RightMark)] join_type: JoinType, ) -> Result<()> { let options = SpatialJoinOptions::default(); - test_mark_join(join_type, options, 10).await?; + + test_mark_join(join_type, options.clone(), 10, false).await?; Ok(()) } + #[cfg(feature = "gpu")] + #[rstest] #[tokio::test] - async fn test_mark_join_via_correlated_exists_sql() -> Result<()> { + async fn test_mark_joins_gpu( + #[values(JoinType::LeftMark, JoinType::RightMark)] 
join_type: JoinType, + ) -> Result<()> { + let options = SpatialJoinOptions::default(); + + test_mark_join(join_type, options, 10, true).await?; + Ok(()) + } + + async fn test_mark_join_via_correlated_exists_sql(use_gpu: bool) -> Result<()> { let ((left_schema, left_partitions), (right_schema, right_partitions)) = create_test_data_with_size_range((0.1, 10.0), WKB_GEOMETRY)?; @@ -1050,7 +1207,13 @@ mod tests { let sql = "SELECT L.id FROM L WHERE L.id = 1 OR EXISTS (SELECT 1 FROM R WHERE ST_Intersects(L.geometry, R.geometry)) ORDER BY L.id"; let batch_size = 10; - let options = SpatialJoinOptions::default(); + let options = SpatialJoinOptions { + gpu: GpuOptions { + enable: use_gpu, + ..GpuOptions::default() + }, + ..SpatialJoinOptions::default() + }; // Optimized plan should include a SpatialJoinExec with Mark join type. let ctx = setup_context(Some(options), batch_size)?; @@ -1094,6 +1257,17 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_mark_join_via_correlated_exists_sql_cpu() -> Result<()> { + test_mark_join_via_correlated_exists_sql(false).await + } + + #[cfg(feature = "gpu")] + #[tokio::test] + async fn test_mark_join_via_correlated_exists_sql_gpu() -> Result<()> { + test_mark_join_via_correlated_exists_sql(true).await + } + #[tokio::test] async fn test_geography_join_is_not_optimized() -> Result<()> { let options = SpatialJoinOptions::default(); @@ -1131,7 +1305,24 @@ mod tests { create_test_data_with_size_range((50.0, 60.0), WKB_GEOMETRY)?; let options = SpatialJoinOptions::default(); test_spatial_join_query(&left_schema, &right_schema, left_partitions.clone(), right_partitions.clone(), &options, 10, - "SELECT id FROM L WHERE ST_Intersects(L.geometry, (SELECT R.geometry FROM R WHERE R.id = 1))").await?; + "SELECT id FROM L WHERE ST_Intersects(L.geometry, (SELECT R.geometry FROM R WHERE R.id = 1))").await?; + Ok(()) + } + + #[cfg(feature = "gpu")] + #[tokio::test] + async fn test_query_window_in_subquery_gpu() -> Result<()> { + let ((left_schema, 
left_partitions), (right_schema, right_partitions)) = + create_test_data_with_size_range((50.0, 60.0), WKB_GEOMETRY)?; + let options = SpatialJoinOptions { + gpu: GpuOptions { + enable: true, + ..GpuOptions::default() + }, + ..Default::default() + }; + test_spatial_join_query(&left_schema, &right_schema, left_partitions.clone(), right_partitions.clone(), &options, 10, + "SELECT id FROM L WHERE ST_Intersects(L.geometry, (SELECT R.geometry FROM R WHERE R.id = 1))").await?; Ok(()) } @@ -1147,18 +1338,21 @@ mod tests { ..Default::default() }; test_spatial_join_query(&left_schema, &right_schema, left_partitions.clone(), right_partitions.clone(), &options, max_batch_size, - "SELECT * FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) AND L.dist < R.dist ORDER BY L.id, R.id").await?; + "SELECT * FROM L JOIN R ON ST_Intersects(L.geometry, R.geometry) AND L.dist < R.dist ORDER BY L.id, R.id").await?; } Ok(()) } - async fn test_with_join_types(join_type: JoinType) -> Result { + async fn test_with_join_types(join_type: JoinType, use_gpu: bool) -> Result { let ((left_schema, left_partitions), (right_schema, right_partitions)) = create_test_data_with_empty_partitions()?; - let options = SpatialJoinOptions { execution_mode: ExecutionMode::PrepareNone, + gpu: GpuOptions { + enable: use_gpu, + ..GpuOptions::default() + }, ..Default::default() }; let batch_size = 30; @@ -1314,6 +1508,7 @@ mod tests { join_type: JoinType, options: SpatialJoinOptions, batch_size: usize, + use_gpu: bool, ) -> Result<()> { let ((left_schema, left_partitions), (right_schema, right_partitions)) = create_test_data_with_size_range((0.1, 10.0), WKB_GEOMETRY)?; @@ -1338,6 +1533,7 @@ mod tests { let spatial_join_execs = collect_spatial_join_exec(&plan)?; assert_eq!(spatial_join_execs.len(), 1); let original_exec = spatial_join_execs[0]; + let mark_exec = SpatialJoinExec::try_new( original_exec.left.clone(), original_exec.right.clone(), @@ -1345,6 +1541,7 @@ mod tests { original_exec.filter.clone(), 
&join_type, None, + use_gpu, )?; // Create NestedLoopJoinExec plan for comparison diff --git a/rust/sedona-spatial-join/src/index.rs b/rust/sedona-spatial-join/src/index.rs index 55df23d56..d25a601ea 100644 --- a/rust/sedona-spatial-join/src/index.rs +++ b/rust/sedona-spatial-join/src/index.rs @@ -16,15 +16,19 @@ // under the License. pub(crate) mod build_side_collector; +pub(crate) mod cpu_spatial_index; +pub(crate) mod cpu_spatial_index_builder; +pub(crate) mod gpu_spatial_index; +pub(crate) mod gpu_spatial_index_builder; mod knn_adapter; pub(crate) mod spatial_index; -pub(crate) mod spatial_index_builder; pub(crate) use build_side_collector::{ BuildPartition, BuildSideBatchesCollector, CollectBuildSideMetrics, }; -pub use spatial_index::SpatialIndex; -pub use spatial_index_builder::{SpatialIndexBuilder, SpatialJoinBuildMetrics}; +pub use cpu_spatial_index_builder::CPUSpatialIndexBuilder; +pub(crate) use spatial_index::SpatialIndex; +pub use spatial_index::SpatialJoinBuildMetrics; use wkb::reader::Wkb; /// The result of a spatial index query diff --git a/rust/sedona-spatial-join/src/index/cpu_spatial_index.rs b/rust/sedona-spatial-join/src/index/cpu_spatial_index.rs new file mode 100644 index 000000000..034bb23c3 --- /dev/null +++ b/rust/sedona-spatial-join/src/index/cpu_spatial_index.rs @@ -0,0 +1,1963 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ + ops::Range, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, +}; + +use arrow_array::RecordBatch; +use arrow_schema::SchemaRef; +use datafusion_common::{DataFusionError, Result}; +use datafusion_common_runtime::JoinSet; +use datafusion_execution::memory_pool::{MemoryPool, MemoryReservation}; +use float_next_after::NextAfter; +use geo::BoundingRect; +use geo_index::rtree::{ + distance::{DistanceMetric, GeometryAccessor}, + util::f64_box_to_f32, +}; +use geo_index::rtree::{sort::HilbertSort, RTree, RTreeBuilder, RTreeIndex}; +use geo_index::IndexableNum; +use geo_types::Rect; +use parking_lot::Mutex; +use sedona_expr::statistics::GeoStatistics; +use sedona_geo::to_geo::item_to_geometry; +use wkb::reader::Wkb; + +use crate::index::SpatialIndex; +use crate::{ + evaluated_batch::EvaluatedBatch, + index::{ + knn_adapter::{KnnComponents, SedonaKnnAdapter}, + IndexQueryResult, QueryResultMetrics, + }, + operand_evaluator::{create_operand_evaluator, distance_value_at, OperandEvaluator}, + refine::{create_refiner, IndexQueryResultRefiner}, + spatial_predicate::SpatialPredicate, + utils::concurrent_reservation::ConcurrentReservation, +}; +use arrow::array::BooleanBufferBuilder; +use async_trait::async_trait; +use sedona_common::{option::SpatialJoinOptions, sedona_internal_err, ExecutionMode}; + +struct CPUSpatialIndexInner { + pub(crate) schema: SchemaRef, + pub(crate) options: SpatialJoinOptions, + + /// The spatial predicate evaluator for the spatial predicate. 
+ pub(crate) evaluator: Arc, + + /// The refiner for refining the index query results. + pub(crate) refiner: Arc, + + /// Memory reservation for tracking the memory usage of the refiner + pub(crate) refiner_reservation: ConcurrentReservation, + + /// R-tree index for the geometry batches. It takes MBRs as query windows and returns + /// data indexes. These data indexes should be translated using `data_id_to_batch_pos` to get + /// the original geometry batch index and row index, or translated using `prepared_geom_idx_vec` + /// to get the prepared geometries array index. + pub(crate) rtree: RTree, + + /// Indexed batches containing evaluated geometry arrays. It contains the original record + /// batches and geometry arrays obtained by evaluating the geometry expression on the build side. + pub(crate) indexed_batches: Vec, + /// An array for translating rtree data index to geometry batch index and row index + pub(crate) data_id_to_batch_pos: Vec<(i32, i32)>, + + /// An array for translating rtree data index to consecutive index. Each geometry may be indexed by + /// multiple boxes, so there could be multiple data indexes for the same geometry. A mapping for + /// squashing the index makes it easier for persisting per-geometry auxiliary data for evaluating + /// the spatial predicate. This is extensively used by the spatial predicate evaluators for storing + /// prepared geometries. + pub(crate) geom_idx_vec: Vec, + + /// Shared bitmap builders for visited left indices, one per batch + pub(crate) visited_build_side: Option>>, + + /// Counter of running probe-threads, potentially able to update `bitmap`. + /// Each time a probe thread finished probing the index, it will decrement the counter. + /// The last finished probe thread will produce the extra output batches for unmatched + /// build side when running left-outer joins. See also [`report_probe_completed`]. 
+ pub(crate) probe_threads_counter: AtomicUsize, + + /// Shared KNN components (distance metrics and geometry cache) for efficient KNN queries + pub(crate) knn_components: Option, + + /// Memory reservation for tracking the memory usage of the spatial index + /// Cleared on `SpatialIndex` drop + #[expect(dead_code)] + pub(crate) reservation: MemoryReservation, +} + +#[derive(Clone)] +pub struct CPUSpatialIndex { + inner: Arc, +} + +impl CPUSpatialIndex { + pub fn empty( + spatial_predicate: SpatialPredicate, + schema: SchemaRef, + options: SpatialJoinOptions, + probe_threads_counter: AtomicUsize, + mut reservation: MemoryReservation, + memory_pool: Arc, + ) -> Self { + let evaluator = create_operand_evaluator(&spatial_predicate, options.clone()); + let refiner = create_refiner( + options.spatial_library, + &spatial_predicate, + options.clone(), + 0, + GeoStatistics::empty(), + ); + let refiner_reservation = reservation.split(0); + let refiner_reservation = ConcurrentReservation::try_new(0, refiner_reservation).unwrap(); + let rtree = RTreeBuilder::::new(0).finish::(); + let knn_components = matches!(spatial_predicate, SpatialPredicate::KNearestNeighbors(_)) + .then(|| KnnComponents::new(0, &[], memory_pool.clone()).unwrap()); + Self { + inner: Arc::new(CPUSpatialIndexInner { + schema, + options, + evaluator, + refiner, + refiner_reservation, + rtree, + data_id_to_batch_pos: Vec::new(), + indexed_batches: Vec::new(), + geom_idx_vec: Vec::new(), + visited_build_side: None, + probe_threads_counter, + knn_components, + reservation, + }), + } + } + + #[allow(clippy::too_many_arguments)] + pub fn new( + schema: SchemaRef, + options: SpatialJoinOptions, + evaluator: Arc, + refiner: Arc, + refiner_reservation: ConcurrentReservation, + rtree: RTree, + indexed_batches: Vec, + data_id_to_batch_pos: Vec<(i32, i32)>, + geom_idx_vec: Vec, + visited_build_side: Option>>, + probe_threads_counter: AtomicUsize, + knn_components: Option, + reservation: MemoryReservation, + ) -> Self 
{ + Self { + inner: Arc::new(CPUSpatialIndexInner { + schema, + options, + evaluator, + refiner, + refiner_reservation, + rtree, + data_id_to_batch_pos, + indexed_batches, + geom_idx_vec, + visited_build_side, + probe_threads_counter, + knn_components, + reservation, + }), + } + } + /// Create a KNN geometry accessor for accessing geometries with caching + fn create_knn_accessor(&self) -> Result> { + let Some(knn_components) = self.inner.knn_components.as_ref() else { + return sedona_internal_err!("knn_components is not initialized when running KNN join"); + }; + Ok(SedonaKnnAdapter::new( + &self.inner.indexed_batches, + &self.inner.data_id_to_batch_pos, + knn_components, + )) + } + async fn refine_concurrently( + &self, + evaluated_batch: &Arc, + row_idx: usize, + candidates: &[u32], + distance: Option, + refine_chunk_size: usize, + ) -> Result<(QueryResultMetrics, Vec<(i32, i32)>)> { + let mut join_set = JoinSet::new(); + for (i, chunk) in candidates.chunks(refine_chunk_size).enumerate() { + let cloned_evaluated_batch = Arc::clone(evaluated_batch); + let chunk = chunk.to_vec(); + let index_owned = self.clone(); + join_set.spawn(async move { + let Some(probe_wkb) = cloned_evaluated_batch.wkb(row_idx) else { + return ( + i, + sedona_internal_err!( + "Failed to get WKB for row {} in evaluated batch", + row_idx + ), + ); + }; + let mut local_positions: Vec<(i32, i32)> = Vec::with_capacity(chunk.len()); + let res = index_owned.refine(probe_wkb, &chunk, &distance, &mut local_positions); + (i, res.map(|r| (r, local_positions))) + }); + } + + // Collect the results in order + let mut refine_results = Vec::with_capacity(join_set.len()); + refine_results.resize_with(join_set.len(), || None); + while let Some(res) = join_set.join_next().await { + let (chunk_idx, refine_res) = + res.map_err(|e| DataFusionError::External(Box::new(e)))?; + let (metrics, positions) = refine_res?; + refine_results[chunk_idx] = Some((metrics, positions)); + } + + let mut total_metrics = 
QueryResultMetrics { + count: 0, + candidate_count: 0, + }; + let mut all_positions = Vec::with_capacity(candidates.len()); + for res in refine_results { + let (metrics, positions) = res.expect("All chunks should be processed"); + total_metrics.count += metrics.count; + total_metrics.candidate_count += metrics.candidate_count; + all_positions.extend(positions); + } + + Ok((total_metrics, all_positions)) + } + + fn refine( + &self, + probe_wkb: &Wkb, + candidates: &[u32], + distance: &Option, + build_batch_positions: &mut Vec<(i32, i32)>, + ) -> Result { + let candidate_count = candidates.len(); + + let mut index_query_results = Vec::with_capacity(candidate_count); + for data_idx in candidates { + let pos = self.inner.data_id_to_batch_pos[*data_idx as usize]; + let (batch_idx, row_idx) = pos; + let indexed_batch = &self.inner.indexed_batches[batch_idx as usize]; + let build_wkb = indexed_batch.wkb(row_idx as usize); + let Some(build_wkb) = build_wkb else { + continue; + }; + let distance = self.inner.evaluator.resolve_distance( + indexed_batch.distance(), + row_idx as usize, + distance, + )?; + let geom_idx = self.inner.geom_idx_vec[*data_idx as usize]; + index_query_results.push(IndexQueryResult { + wkb: build_wkb, + distance, + geom_idx, + position: pos, + }); + } + + if index_query_results.is_empty() { + return Ok(QueryResultMetrics { + count: 0, + candidate_count, + }); + } + + let results = self.inner.refiner.refine(probe_wkb, &index_query_results)?; + let num_results = results.len(); + build_batch_positions.extend(results); + + // Update refiner memory reservation + self.inner + .refiner_reservation + .resize(self.inner.refiner.mem_usage())?; + + Ok(QueryResultMetrics { + count: num_results, + candidate_count, + }) + } +} + +#[async_trait] +impl SpatialIndex for CPUSpatialIndex { + fn schema(&self) -> SchemaRef { + self.inner.schema.clone() + } + + fn get_num_indexed_batches(&self) -> usize { + self.inner.indexed_batches.len() + } + + fn 
get_indexed_batch(&self, batch_idx: usize) -> &RecordBatch { + &self.inner.indexed_batches[batch_idx].batch + } + + #[allow(unused)] + fn query( + &self, + probe_wkb: &Wkb, + probe_rect: &Rect, + distance: &Option, + build_batch_positions: &mut Vec<(i32, i32)>, + ) -> Result { + let min = probe_rect.min(); + let max = probe_rect.max(); + let mut candidates = self.inner.rtree.search(min.x, min.y, max.x, max.y); + if candidates.is_empty() { + return Ok(QueryResultMetrics { + count: 0, + candidate_count: 0, + }); + } + + // Sort and dedup candidates to avoid duplicate results when we index one geometry + // using several boxes. + candidates.sort_unstable(); + candidates.dedup(); + + // Refine the candidates retrieved from the r-tree index by evaluating the actual spatial predicate + self.refine(probe_wkb, &candidates, distance, build_batch_positions) + } + + fn query_knn( + &self, + probe_wkb: &Wkb, + k: u32, + use_spheroid: bool, + include_tie_breakers: bool, + build_batch_positions: &mut Vec<(i32, i32)>, + ) -> Result { + if k == 0 { + return Ok(QueryResultMetrics { + count: 0, + candidate_count: 0, + }); + } + + // Check if index is empty + if self.inner.indexed_batches.is_empty() || self.inner.data_id_to_batch_pos.is_empty() { + return Ok(QueryResultMetrics { + count: 0, + candidate_count: 0, + }); + } + + // Convert probe WKB to geo::Geometry + let probe_geom = match item_to_geometry(probe_wkb) { + Ok(geom) => geom, + Err(_) => { + // Empty or unsupported geometries (e.g., POINT EMPTY) return empty results + return Ok(QueryResultMetrics { + count: 0, + candidate_count: 0, + }); + } + }; + + // Select the appropriate distance metric + let distance_metric: &dyn DistanceMetric = { + let Some(knn_components) = self.inner.knn_components.as_ref() else { + return sedona_internal_err!( + "knn_components is not initialized when running KNN join" + ); + }; + if use_spheroid { + &knn_components.haversine_metric + } else { + &knn_components.euclidean_metric + } + }; + + // 
Create geometry accessor for on-demand WKB decoding and caching + let geometry_accessor = self.create_knn_accessor()?; + + // Use neighbors_geometry to find k nearest neighbors + let initial_results = self.inner.rtree.neighbors_geometry( + &probe_geom, + Some(k as usize), + None, // no max_distance filter + distance_metric, + &geometry_accessor, + ); + + if initial_results.is_empty() { + return Ok(QueryResultMetrics { + count: 0, + candidate_count: 0, + }); + } + + let mut final_results = initial_results; + let mut candidate_count = final_results.len(); + + // Handle tie-breakers if enabled + if include_tie_breakers && !final_results.is_empty() && k > 0 { + // Calculate distances for the initial k results to find the k-th distance + let mut distances_with_indices: Vec<(f64, u32)> = Vec::new(); + + for &result_idx in &final_results { + if (result_idx as usize) < self.inner.data_id_to_batch_pos.len() { + if let Some(item_geom) = geometry_accessor.get_geometry(result_idx as usize) { + let distance = distance_metric.distance_to_geometry(&probe_geom, item_geom); + if let Some(distance_f64) = distance.to_f64() { + distances_with_indices.push((distance_f64, result_idx)); + } + } + } + } + + // Sort by distance + distances_with_indices + .sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)); + + // Find the k-th distance (if we have at least k results) + if distances_with_indices.len() >= k as usize { + let k_idx = (k as usize) + .min(distances_with_indices.len()) + .saturating_sub(1); + let max_distance = distances_with_indices[k_idx].0; + + // For tie-breakers, create spatial envelope around probe centroid and use rtree.search() + + // Create envelope bounds by expanding the probe bounding box by max_distance + let Some(rect) = probe_geom.bounding_rect() else { + // If bounding rectangle cannot be computed, return empty results + return Ok(QueryResultMetrics { + count: 0, + candidate_count: 0, + }); + }; + + let min = rect.min(); + let max = 
rect.max(); + let (min_x, min_y, max_x, max_y) = f64_box_to_f32(min.x, min.y, max.x, max.y); + let mut distance_f32 = max_distance as f32; + if (distance_f32 as f64) < max_distance { + distance_f32 = distance_f32.next_after(f32::INFINITY); + } + let (min_x, min_y, max_x, max_y) = ( + min_x - distance_f32, + min_y - distance_f32, + max_x + distance_f32, + max_y + distance_f32, + ); + + // Use rtree.search() with envelope bounds (like the old code) + let expanded_results = self.inner.rtree.search(min_x, min_y, max_x, max_y); + + candidate_count = expanded_results.len(); + + // Calculate distances for all results and find ties + let mut all_distances_with_indices: Vec<(f64, u32)> = Vec::new(); + + for &result_idx in &expanded_results { + if (result_idx as usize) < self.inner.data_id_to_batch_pos.len() { + if let Some(item_geom) = geometry_accessor.get_geometry(result_idx as usize) + { + let distance = + distance_metric.distance_to_geometry(&probe_geom, item_geom); + if let Some(distance_f64) = distance.to_f64() { + all_distances_with_indices.push((distance_f64, result_idx)); + } + } + } + } + + // Sort by distance + all_distances_with_indices + .sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)); + + // Include all results up to and including those with the same distance as the k-th result + const DISTANCE_TOLERANCE: f64 = 1e-9; + let mut tie_breaker_results: Vec = Vec::new(); + + for (i, &(distance, result_idx)) in all_distances_with_indices.iter().enumerate() { + if i < k as usize { + // Include the first k results + tie_breaker_results.push(result_idx); + } else if (distance - max_distance).abs() <= DISTANCE_TOLERANCE { + // Include tie-breakers (same distance as k-th result) + tie_breaker_results.push(result_idx); + } else { + // No more ties, stop + break; + } + } + + final_results = tie_breaker_results; + } + } else { + // When tie-breakers are disabled, limit results to exactly k + if final_results.len() > k as usize { + 
final_results.truncate(k as usize); + } + } + + // Convert results to build_batch_positions using existing data_id_to_batch_pos mapping + for &result_idx in &final_results { + if (result_idx as usize) < self.inner.data_id_to_batch_pos.len() { + build_batch_positions.push(self.inner.data_id_to_batch_pos[result_idx as usize]); + } + } + + Ok(QueryResultMetrics { + count: final_results.len(), + candidate_count, + }) + } + + async fn query_batch( + &self, + evaluated_batch: &Arc, + range: Range, + max_result_size: usize, + build_batch_positions: &mut Vec<(i32, i32)>, + probe_indices: &mut Vec, + ) -> Result<(QueryResultMetrics, usize)> { + if range.is_empty() { + return Ok(( + QueryResultMetrics { + count: 0, + candidate_count: 0, + }, + range.start, + )); + } + + let rects = evaluated_batch.rects(); + let dist = evaluated_batch.distance(); + let mut total_candidates_count = 0; + let mut total_count = 0; + let mut current_row_idx = range.start; + for row_idx in range { + current_row_idx = row_idx; + let Some(probe_rect) = rects[row_idx] else { + continue; + }; + + let min = probe_rect.min(); + let max = probe_rect.max(); + let mut candidates = self.inner.rtree.search(min.x, min.y, max.x, max.y); + if candidates.is_empty() { + continue; + } + + let Some(probe_wkb) = evaluated_batch.wkb(row_idx) else { + return sedona_internal_err!( + "Failed to get WKB for row {} in evaluated batch", + row_idx + ); + }; + + // Sort and dedup candidates to avoid duplicate results when we index one geometry + // using several boxes. 
+ candidates.sort_unstable(); + candidates.dedup(); + + let distance = match dist { + Some(dist_array) => distance_value_at(dist_array, row_idx)?, + None => None, + }; + + // Refine the candidates retrieved from the r-tree index by evaluating the actual spatial predicate + let refine_chunk_size = self.inner.options.parallel_refinement_chunk_size; + if refine_chunk_size == 0 || candidates.len() < refine_chunk_size * 2 { + // For small candidate sets, use refine synchronously + let metrics = + self.refine(probe_wkb, &candidates, &distance, build_batch_positions)?; + probe_indices.extend(std::iter::repeat_n(row_idx as u32, metrics.count)); + total_count += metrics.count; + total_candidates_count += metrics.candidate_count; + } else { + // For large candidate sets, spawn several tasks to parallelize refinement + let (metrics, positions) = self + .refine_concurrently( + evaluated_batch, + row_idx, + &candidates, + distance, + refine_chunk_size, + ) + .await?; + build_batch_positions.extend(positions); + probe_indices.extend(std::iter::repeat_n(row_idx as u32, metrics.count)); + total_count += metrics.count; + total_candidates_count += metrics.candidate_count; + } + + if total_count >= max_result_size { + break; + } + } + + let end_idx = current_row_idx + 1; + Ok(( + QueryResultMetrics { + count: total_count, + candidate_count: total_candidates_count, + }, + end_idx, + )) + } + + fn need_more_probe_stats(&self) -> bool { + self.inner.refiner.need_more_probe_stats() + } + + fn merge_probe_stats(&self, stats: GeoStatistics) { + self.inner.refiner.merge_probe_stats(stats); + } + + fn visited_build_side(&self) -> Option<&Mutex>> { + self.inner.visited_build_side.as_ref() + } + + fn report_probe_completed(&self) -> bool { + self.inner + .probe_threads_counter + .fetch_sub(1, Ordering::Relaxed) + == 1 + } + + fn get_refiner_mem_usage(&self) -> usize { + self.inner.refiner.mem_usage() + } + + fn get_actual_execution_mode(&self) -> ExecutionMode { + 
self.inner.refiner.actual_execution_mode() + } +} + +#[cfg(test)] +mod tests { + use crate::{ + index::{CPUSpatialIndexBuilder as SpatialIndexBuilder, SpatialJoinBuildMetrics}, + operand_evaluator::EvaluatedGeometryArray, + spatial_predicate::{KNNPredicate, RelationPredicate}, + }; + + use super::*; + use crate::index::spatial_index::SpatialIndexRef; + use crate::spatial_predicate::SpatialRelationType; + use arrow_array::RecordBatch; + use arrow_schema::{DataType, Field}; + use datafusion_common::JoinSide; + use datafusion_execution::memory_pool::GreedyMemoryPool; + use datafusion_expr::JoinType; + use datafusion_physical_expr::expressions::Column; + use geo_traits::Dimensions; + use sedona_common::option::{ExecutionMode, SpatialJoinOptions}; + use sedona_geometry::wkb_factory::write_wkb_empty_point; + use sedona_schema::datatypes::WKB_GEOMETRY; + use sedona_testing::create::create_array; + + #[test] + fn test_spatial_index_builder_empty() { + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + let schema = Arc::new(arrow_schema::Schema::empty()); + let spatial_predicate = SpatialPredicate::Relation(RelationPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + SpatialRelationType::Intersects, + )); + + let builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 4, + memory_pool, + metrics, + ) + .unwrap(); + + // Test finishing with empty data + let index = builder.finish().unwrap(); + assert_eq!(index.schema(), schema); + assert_eq!(index.get_num_indexed_batches(), 0); + } + + #[test] + fn test_spatial_index_builder_add_batch() { + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; 
+ let metrics = SpatialJoinBuildMetrics::default(); + + let spatial_predicate = SpatialPredicate::Relation(RelationPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + SpatialRelationType::Intersects, + )); + + // Create a simple test geometry batch + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 4, + memory_pool, + metrics, + ) + .unwrap(); + + let batch = RecordBatch::new_empty(schema.clone()); + let geom_batch = create_array( + &[ + Some("POINT (0.25 0.25)"), + Some("POINT (10 10)"), + None, + Some("POINT (0.25 0.25)"), + ], + &WKB_GEOMETRY, + ); + let indexed_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), + }; + builder.add_batch(indexed_batch).unwrap(); + + let index = builder.finish().unwrap(); + assert_eq!(index.schema(), schema); + assert_eq!(index.get_num_indexed_batches(), 1); + } + + #[test] + fn test_knn_query_execution_with_sample_data() { + // Create a spatial index with sample geometry data + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + + let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + 5, + false, + JoinSide::Left, + )); + + // Create sample geometry data - points at known locations + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 4, + memory_pool, + metrics, + ) + .unwrap(); + + let batch = 
RecordBatch::new_empty(schema.clone()); + + // Create geometries at different distances from the query point (0, 0) + let geom_batch = create_array( + &[ + Some("POINT (1 0)"), // Distance: 1.0 + Some("POINT (0 2)"), // Distance: 2.0 + Some("POINT (3 0)"), // Distance: 3.0 + Some("POINT (0 4)"), // Distance: 4.0 + Some("POINT (5 0)"), // Distance: 5.0 + Some("POINT (2 2)"), // Distance: ~2.83 + Some("POINT (1 1)"), // Distance: ~1.41 + ], + &WKB_GEOMETRY, + ); + + let indexed_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), + }; + builder.add_batch(indexed_batch).unwrap(); + + let index = builder.finish().unwrap(); + + // Create a query geometry at origin (0, 0) + let query_geom = create_array(&[Some("POINT (0 0)")], &WKB_GEOMETRY); + let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); + let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); + + // Test KNN query with k=3 + let mut build_positions = Vec::new(); + let result = index + .query_knn( + query_wkb, + 3, // k=3 + false, // use_spheroid=false + false, // include_tie_breakers=false + &mut build_positions, + ) + .unwrap(); + + // Verify we got 3 results + assert_eq!(build_positions.len(), 3); + assert_eq!(result.count, 3); + assert!(result.candidate_count >= 3); + + // Create a mapping of positions to verify correct ordering + // We expect the 3 closest points: (1,0), (1,1), (0,2) + let expected_closest_indices = vec![0, 6, 1]; // Based on our sample data ordering + let mut found_indices = Vec::new(); + + for (_batch_idx, row_idx) in &build_positions { + found_indices.push(*row_idx as usize); + } + + // Sort to compare sets (order might vary due to implementation) + found_indices.sort(); + let mut expected_sorted = expected_closest_indices; + expected_sorted.sort(); + + assert_eq!(found_indices, expected_sorted); + } + + #[test] + fn test_knn_query_execution_with_different_k_values() { + // Create spatial 
index with more data points + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + + let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + 5, + false, + JoinSide::Left, + )); + + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 4, + memory_pool, + metrics, + ) + .unwrap(); + + let batch = RecordBatch::new_empty(schema.clone()); + + // Create 10 points at regular intervals + let geom_batch = create_array( + &[ + Some("POINT (1 0)"), // 0: Distance 1 + Some("POINT (2 0)"), // 1: Distance 2 + Some("POINT (3 0)"), // 2: Distance 3 + Some("POINT (4 0)"), // 3: Distance 4 + Some("POINT (5 0)"), // 4: Distance 5 + Some("POINT (6 0)"), // 5: Distance 6 + Some("POINT (7 0)"), // 6: Distance 7 + Some("POINT (8 0)"), // 7: Distance 8 + Some("POINT (9 0)"), // 8: Distance 9 + Some("POINT (10 0)"), // 9: Distance 10 + ], + &WKB_GEOMETRY, + ); + + let indexed_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), + }; + builder.add_batch(indexed_batch).unwrap(); + + let index = builder.finish().unwrap(); + + // Query point at origin + let query_geom = create_array(&[Some("POINT (0 0)")], &WKB_GEOMETRY); + let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); + let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); + + // Test different k values + for k in [1, 3, 5, 7, 10] { + let mut build_positions = Vec::new(); + let result = index + .query_knn(query_wkb, k, false, false, &mut build_positions) + .unwrap(); + + // Verify we got 
exactly k results (or all available if k > total) + let expected_results = std::cmp::min(k as usize, 10); + assert_eq!(build_positions.len(), expected_results); + assert_eq!(result.count, expected_results); + + // Verify the results are the k closest points + let mut row_indices: Vec = build_positions + .iter() + .map(|(_, row_idx)| *row_idx as usize) + .collect(); + row_indices.sort(); + + let expected_indices: Vec = (0..expected_results).collect(); + assert_eq!(row_indices, expected_indices); + } + } + + #[test] + fn test_knn_query_execution_with_spheroid_distance() { + // Create spatial index + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + + let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + 5, + true, + JoinSide::Left, + )); + + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 4, + memory_pool, + metrics, + ) + .unwrap(); + + let batch = RecordBatch::new_empty(schema.clone()); + + // Create points with geographic coordinates (longitude, latitude) + let geom_batch = create_array( + &[ + Some("POINT (-74.0 40.7)"), // NYC area + Some("POINT (-73.9 40.7)"), // Slightly east + Some("POINT (-74.1 40.7)"), // Slightly west + Some("POINT (-74.0 40.8)"), // Slightly north + Some("POINT (-74.0 40.6)"), // Slightly south + ], + &WKB_GEOMETRY, + ); + + let indexed_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), + }; + builder.add_batch(indexed_batch).unwrap(); + + let index = builder.finish().unwrap(); + + // Query point at NYC + let query_geom = 
create_array(&[Some("POINT (-74.0 40.7)")], &WKB_GEOMETRY); + let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); + let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); + + // Test with planar distance (spheroid distance is not supported) + let mut build_positions = Vec::new(); + let result = index + .query_knn( + query_wkb, + 3, // k=3 + false, // use_spheroid=false (only supported option) + false, + &mut build_positions, + ) + .unwrap(); + + // Should find results with planar distance calculation + assert!(!build_positions.is_empty()); // At least the exact match + assert!(result.count >= 1); + assert!(result.candidate_count >= 1); + + // Test that spheroid distance now works with Haversine metric + let mut build_positions_spheroid = Vec::new(); + let result_spheroid = index.query_knn( + query_wkb, + 3, // k=3 + true, // use_spheroid=true (now supported with Haversine) + false, + &mut build_positions_spheroid, + ); + + // Should succeed and return results + assert!(result_spheroid.is_ok()); + let result_spheroid = result_spheroid.unwrap(); + assert!(!build_positions_spheroid.is_empty()); + assert!(result_spheroid.count >= 1); + assert!(result_spheroid.candidate_count >= 1); + } + + #[test] + fn test_knn_query_execution_edge_cases() { + // Create spatial index + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + + let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + 5, + false, + JoinSide::Left, + )); + + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 4, + memory_pool, + 
metrics, + ) + .unwrap(); + + let batch = RecordBatch::new_empty(schema.clone()); + + // Create sample data with some edge cases + let geom_batch = create_array( + &[ + Some("POINT (1 1)"), + Some("POINT (2 2)"), + None, // NULL geometry + Some("POINT (3 3)"), + ], + &WKB_GEOMETRY, + ); + + let indexed_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), + }; + builder.add_batch(indexed_batch).unwrap(); + + let index = builder.finish().unwrap(); + + let query_geom = create_array(&[Some("POINT (0 0)")], &WKB_GEOMETRY); + let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); + let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); + + // Test k=0 (should return no results) + let mut build_positions = Vec::new(); + let result = index + .query_knn( + query_wkb, + 0, // k=0 + false, + false, + &mut build_positions, + ) + .unwrap(); + + assert_eq!(build_positions.len(), 0); + assert_eq!(result.count, 0); + assert_eq!(result.candidate_count, 0); + + // Test k > available geometries + let mut build_positions = Vec::new(); + let result = index + .query_knn( + query_wkb, + 10, // k=10, but only 3 valid geometries available + false, + false, + &mut build_positions, + ) + .unwrap(); + + // Should return all available valid geometries (excluding NULL) + assert_eq!(build_positions.len(), 3); + assert_eq!(result.count, 3); + } + + #[test] + fn test_knn_query_execution_empty_index() { + // Create empty spatial index + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + let schema = Arc::new(arrow_schema::Schema::empty()); + + let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + 5, + false, + JoinSide::Left, + 
)); + + let builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 4, + memory_pool, + metrics, + ) + .unwrap(); + + let index = builder.finish().unwrap(); + + // Try to query empty index + let query_geom = create_array(&[Some("POINT (0 0)")], &WKB_GEOMETRY); + let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); + let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); + + let mut build_positions = Vec::new(); + let result = index + .query_knn(query_wkb, 5, false, false, &mut build_positions) + .unwrap(); + + // Should return no results for empty index + assert_eq!(build_positions.len(), 0); + assert_eq!(result.count, 0); + assert_eq!(result.candidate_count, 0); + } + + #[test] + fn test_knn_query_execution_with_tie_breakers() { + // Create a spatial index with sample geometry data + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + + let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + 5, + false, + JoinSide::Left, + )); + + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 1, // probe_threads_count + memory_pool.clone(), + metrics, + ) + .unwrap(); + + let batch = RecordBatch::new_empty(schema.clone()); + + // Create points where we have more ties at the k-th distance + // Query point is at (0.0, 0.0) + // We'll create a scenario with k=2 where there are 3 points at the same distance + // This ensures the tie-breaker logic has work to do + let geom_batch = create_array( + &[ + Some("POINT (1.0 0.0)"), // Squared distance 1.0 
+ Some("POINT (0.0 1.0)"), // Squared distance 1.0 (tie!) + Some("POINT (-1.0 0.0)"), // Squared distance 1.0 (tie!) + Some("POINT (0.0 -1.0)"), // Squared distance 1.0 (tie!) + Some("POINT (2.0 0.0)"), // Squared distance 4.0 + Some("POINT (0.0 2.0)"), // Squared distance 4.0 + ], + &WKB_GEOMETRY, + ); + + let indexed_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), + }; + builder.add_batch(indexed_batch).unwrap(); + + let index = builder.finish().unwrap(); + + // Query point at the origin (0.0, 0.0) + let query_geom = create_array(&[Some("POINT (0.0 0.0)")], &WKB_GEOMETRY); + let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); + let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); + + // Test without tie-breakers: should return exactly k=2 results + let mut build_positions = Vec::new(); + let result = index + .query_knn( + query_wkb, + 2, // k=2 + false, // use_spheroid + false, // include_tie_breakers + &mut build_positions, + ) + .unwrap(); + + // Should return exactly 2 results (the closest point + 1 of the tied points) + assert_eq!(result.count, 2); + assert_eq!(build_positions.len(), 2); + + // Test with tie-breakers: should return k=2 plus all ties + let mut build_positions_with_ties = Vec::new(); + let result_with_ties = index + .query_knn( + query_wkb, + 2, // k=2 + false, // use_spheroid + true, // include_tie_breakers + &mut build_positions_with_ties, + ) + .unwrap(); + + // Should return more than 2 results because of ties + // We have 4 points at squared distance 1.0 (all tied for closest) + // With k=2 and tie-breakers: + // - Initial neighbors query returns 2 of the 4 tied points + // - Tie-breaker logic should find the other 2 tied points + // - Total should be 4 results (all points at distance 1.0) + + // With 4 points all at the same distance and k=2: + // - Without tie-breakers: should return exactly 2 + // - With tie-breakers: should 
return all 4 tied points + assert_eq!( + result.count, 2, + "Without tie-breakers should return exactly k=2" + ); + assert_eq!( + result_with_ties.count, 4, + "With tie-breakers should return all 4 tied points" + ); + assert_eq!(build_positions_with_ties.len(), 4); + } + + #[test] + fn test_query_knn_with_geometry_distance() { + // Create a spatial index with sample geometry data + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + + let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + 5, + false, + JoinSide::Left, + )); + + // Create sample geometry data - points at known locations + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 4, + memory_pool, + metrics, + ) + .unwrap(); + + let batch = RecordBatch::new_empty(schema.clone()); + + // Create geometries at different distances from the query point (0, 0) + let geom_batch = create_array( + &[ + Some("POINT (1 0)"), // Distance: 1.0 + Some("POINT (0 2)"), // Distance: 2.0 + Some("POINT (3 0)"), // Distance: 3.0 + Some("POINT (0 4)"), // Distance: 4.0 + Some("POINT (5 0)"), // Distance: 5.0 + Some("POINT (2 2)"), // Distance: ~2.83 + Some("POINT (1 1)"), // Distance: ~1.41 + ], + &WKB_GEOMETRY, + ); + + let indexed_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), + }; + builder.add_batch(indexed_batch).unwrap(); + + let index = builder.finish().unwrap(); + + // Create a query geometry at origin (0, 0) + let query_geom = create_array(&[Some("POINT (0 0)")], &WKB_GEOMETRY); + let query_array = 
EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); + let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); + + // Test the geometry-based query_knn method with k=3 + let mut build_positions = Vec::new(); + let result = index + .query_knn( + query_wkb, + 3, // k=3 + false, // use_spheroid=false + false, // include_tie_breakers=false + &mut build_positions, + ) + .unwrap(); + + // Verify we got results (should be 3 or less) + assert!(!build_positions.is_empty()); + assert!(build_positions.len() <= 3); + assert!(result.count > 0); + assert!(result.count <= 3); + } + + #[test] + fn test_query_knn_with_mixed_geometries() { + // Create a spatial index with complex geometries where geometry-based + // distance should differ from centroid-based distance + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + + let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + 5, + false, + JoinSide::Left, + )); + + // Create different geometry types + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 4, + memory_pool, + metrics, + ) + .unwrap(); + + let batch = RecordBatch::new_empty(schema.clone()); + + // Mix of points and linestrings + let geom_batch = create_array( + &[ + Some("POINT (1 1)"), // Simple point + Some("LINESTRING (2 0, 2 4)"), // Vertical line - closest point should be (2, 1) + Some("LINESTRING (10 10, 10 20)"), // Far away line + Some("POINT (5 5)"), // Far point + ], + &WKB_GEOMETRY, + ); + + let indexed_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_batch, 
&WKB_GEOMETRY).unwrap(), + }; + builder.add_batch(indexed_batch).unwrap(); + + let index = builder.finish().unwrap(); + + // Query point close to the linestring + let query_geom = create_array(&[Some("POINT (2.1 1.0)")], &WKB_GEOMETRY); + let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); + let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); + + // Test the geometry-based KNN method with mixed geometry types + let mut build_positions = Vec::new(); + + let result = index + .query_knn( + query_wkb, + 2, // k=2 + false, // use_spheroid=false + false, // include_tie_breakers=false + &mut build_positions, + ) + .unwrap(); + + // Should return results + assert!(!build_positions.is_empty()); + + // Should work with mixed geometry types + assert!(result.count > 0); + } + + #[test] + fn test_query_knn_with_tie_breakers_geometry_distance() { + // Create a spatial index with geometries that have identical distances for tie-breaker testing + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + + let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + 5, + false, + JoinSide::Left, + )); + + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 4, + memory_pool, + metrics, + ) + .unwrap(); + + let batch = RecordBatch::new_empty(schema.clone()); + + // Create points where we have multiple points at the same distance from the query point + // Query point will be at (0, 0), and we'll have 4 points all at distance sqrt(2) ≈ 1.414 + let geom_batch = create_array( + &[ + Some("POINT (1.0 1.0)"), // 
Distance: sqrt(2) + Some("POINT (1.0 -1.0)"), // Distance: sqrt(2) - tied with above + Some("POINT (-1.0 1.0)"), // Distance: sqrt(2) - tied with above + Some("POINT (-1.0 -1.0)"), // Distance: sqrt(2) - tied with above + Some("POINT (2.0 0.0)"), // Distance: 2.0 - farther away + Some("POINT (0.0 2.0)"), // Distance: 2.0 - farther away + ], + &WKB_GEOMETRY, + ); + + let indexed_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), + }; + builder.add_batch(indexed_batch).unwrap(); + + let index = builder.finish().unwrap(); + + // Query point at the origin (0.0, 0.0) + let query_geom = create_array(&[Some("POINT (0.0 0.0)")], &WKB_GEOMETRY); + let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); + let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); + + // Test without tie-breakers: should return exactly k=2 results + let mut build_positions = Vec::new(); + let result = index + .query_knn( + query_wkb, + 2, // k=2 + false, // use_spheroid + false, // include_tie_breakers=false + &mut build_positions, + ) + .unwrap(); + + // Should return exactly 2 results + assert_eq!(result.count, 2); + assert_eq!(build_positions.len(), 2); + + // Test with tie-breakers: should return all tied points + let mut build_positions_with_ties = Vec::new(); + let result_with_ties = index + .query_knn( + query_wkb, + 2, // k=2 + false, // use_spheroid + true, // include_tie_breakers=true + &mut build_positions_with_ties, + ) + .unwrap(); + + // Should return 4 results because of ties (all 4 points at distance sqrt(2)) + assert!(result_with_ties.count == 4); + + // Query using a box centered at the origin + let query_geom = create_array( + &[Some( + "POLYGON ((-0.5 -0.5, -0.5 0.5, 0.5 0.5, 0.5 -0.5, -0.5 -0.5))", + )], + &WKB_GEOMETRY, + ); + let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); + let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); + + 
// This query should return 4 points + let mut build_positions_with_ties = Vec::new(); + let result_with_ties = index + .query_knn( + query_wkb, + 2, // k=2 + false, // use_spheroid + true, // include_tie_breakers=true + &mut build_positions_with_ties, + ) + .unwrap(); + + // Should return 4 results because of ties (all 4 points at distance sqrt(2)) + assert!(result_with_ties.count == 4); + } + + #[test] + fn test_knn_query_with_empty_geometry() { + // Create a spatial index with sample geometry data like other tests + let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); + let options = SpatialJoinOptions { + execution_mode: ExecutionMode::PrepareBuild, + ..Default::default() + }; + let metrics = SpatialJoinBuildMetrics::default(); + + let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( + Arc::new(Column::new("geom", 0)), + Arc::new(Column::new("geom", 1)), + 5, + false, + JoinSide::Left, + )); + + // Create geometry batch using the same pattern as other tests + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema.clone(), + spatial_predicate, + options, + JoinType::Inner, + 1, // probe_threads_count + memory_pool.clone(), + metrics, + ) + .unwrap(); + + let batch = RecordBatch::new_empty(schema.clone()); + + let geom_batch = create_array( + &[ + Some("POINT (0 0)"), + Some("POINT (1 1)"), + Some("POINT (2 2)"), + ], + &WKB_GEOMETRY, + ); + let indexed_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), + }; + builder.add_batch(indexed_batch).unwrap(); + + let index = builder.finish().unwrap(); + + // Create an empty point WKB + let mut empty_point_wkb = Vec::new(); + write_wkb_empty_point(&mut empty_point_wkb, Dimensions::Xy).unwrap(); + + // Query with the empty point + let mut build_positions = Vec::new(); + let result = index + .query_knn( + 
&wkb::reader::read_wkb(&empty_point_wkb).unwrap(), + 2, // k=2 + false, // use_spheroid + false, // include_tie_breakers + &mut build_positions, + ) + .unwrap(); + + // Should return empty results for empty geometry + assert_eq!(result.count, 0); + assert_eq!(result.candidate_count, 0); + assert!(build_positions.is_empty()); + } + + async fn setup_index_for_batch_test( + build_geoms: &[Option<&str>], + options: SpatialJoinOptions, + ) -> SpatialIndexRef { + let memory_pool = Arc::new(GreedyMemoryPool::new(100 * 1024 * 1024)); + let metrics = SpatialJoinBuildMetrics::default(); + let spatial_predicate = SpatialPredicate::Relation(RelationPredicate::new( + Arc::new(Column::new("left", 0)), + Arc::new(Column::new("right", 0)), + SpatialRelationType::Intersects, + )); + let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])); + + let mut builder = SpatialIndexBuilder::new( + schema, + spatial_predicate, + options, + JoinType::Inner, + 1, + memory_pool, + metrics, + ) + .unwrap(); + + let geom_array = create_array(build_geoms, &WKB_GEOMETRY); + let batch = RecordBatch::try_new( + Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])), + vec![Arc::new(geom_array.clone())], + ) + .unwrap(); + let evaluated_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_array, &WKB_GEOMETRY).unwrap(), + }; + + builder.add_batch(evaluated_batch).unwrap(); + builder.finish().unwrap() + } + + fn create_probe_batch(probe_geoms: &[Option<&str>]) -> Arc { + let geom_array = create_array(probe_geoms, &WKB_GEOMETRY); + let batch = RecordBatch::try_new( + Arc::new(arrow_schema::Schema::new(vec![Field::new( + "geom", + DataType::Binary, + true, + )])), + vec![Arc::new(geom_array.clone())], + ) + .unwrap(); + Arc::new(EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray::try_new(geom_array, &WKB_GEOMETRY).unwrap(), + }) + } + + #[tokio::test] + async fn 
test_query_batch_empty_results() { + let build_geoms = &[Some("POINT (0 0)"), Some("POINT (1 1)")]; + let index = setup_index_for_batch_test(build_geoms, SpatialJoinOptions::default()).await; + + // Probe with geometries that don't intersect + let probe_geoms = &[Some("POINT (10 10)"), Some("POINT (20 20)")]; + let probe_batch = create_probe_batch(probe_geoms); + + let mut build_batch_positions = Vec::new(); + let mut probe_indices = Vec::new(); + let (metrics, next_idx) = index + .query_batch( + &probe_batch, + 0..2, + usize::MAX, + &mut build_batch_positions, + &mut probe_indices, + ) + .await + .unwrap(); + + assert_eq!(metrics.count, 0); + assert_eq!(build_batch_positions.len(), 0); + assert_eq!(probe_indices.len(), 0); + assert_eq!(next_idx, 2); + } + + #[tokio::test] + async fn test_query_batch_max_result_size() { + let build_geoms = &[ + Some("POINT (0 0)"), + Some("POINT (0 0)"), + Some("POINT (0 0)"), + ]; + let index = setup_index_for_batch_test(build_geoms, SpatialJoinOptions::default()).await; + + // Probe with geometry that intersects all 3 + let probe_geoms = &[Some("POINT (0 0)"), Some("POINT (0 0)")]; + let probe_batch = create_probe_batch(probe_geoms); + + // Case 1: Max result size is large enough + let mut build_batch_positions = Vec::new(); + let mut probe_indices = Vec::new(); + let (metrics, next_idx) = index + .query_batch( + &probe_batch, + 0..2, + 10, + &mut build_batch_positions, + &mut probe_indices, + ) + .await + .unwrap(); + assert_eq!(metrics.count, 6); // 2 probes * 3 matches + assert_eq!(next_idx, 2); + assert_eq!(probe_indices, vec![0, 0, 0, 1, 1, 1]); + + // Case 2: Max result size is small (stops after first probe) + let mut build_batch_positions = Vec::new(); + let mut probe_indices = Vec::new(); + let (metrics, next_idx) = index + .query_batch( + &probe_batch, + 0..2, + 2, // Stop after 2 results + &mut build_batch_positions, + &mut probe_indices, + ) + .await + .unwrap(); + + // It should process the first probe, find 3 
matches. + // Since 3 >= 2, it should stop. + assert_eq!(metrics.count, 3); + assert_eq!(next_idx, 1); // Only processed 1 probe + assert_eq!(probe_indices, vec![0, 0, 0]); + } + + #[tokio::test] + async fn test_query_batch_parallel_refinement() { + // Create enough build geometries to trigger parallel refinement + // We need candidates.len() >= chunk_size * 2 + // Let's set chunk_size = 2, so we need >= 4 candidates. + let build_geoms = vec![Some("POINT (0 0)"); 10]; + let options = SpatialJoinOptions { + parallel_refinement_chunk_size: 2, + ..Default::default() + }; + + let index = setup_index_for_batch_test(&build_geoms, options).await; + + // Probe with a geometry that intersects all build geometries + let probe_geoms = &[Some("POLYGON ((-1 -1, 1 -1, 1 1, -1 1, -1 -1))")]; + let probe_batch = create_probe_batch(probe_geoms); + + let mut build_batch_positions = Vec::new(); + let mut probe_indices = Vec::new(); + let (metrics, next_idx) = index + .query_batch( + &probe_batch, + 0..1, + usize::MAX, + &mut build_batch_positions, + &mut probe_indices, + ) + .await + .unwrap(); + + assert_eq!(metrics.count, 10); + assert_eq!(build_batch_positions.len(), 10); + assert_eq!(probe_indices, vec![0; 10]); + assert_eq!(next_idx, 1); + } + + #[tokio::test] + async fn test_query_batch_empty_range() { + let build_geoms = &[Some("POINT (0 0)")]; + let index = setup_index_for_batch_test(build_geoms, SpatialJoinOptions::default()).await; + let probe_geoms = &[Some("POINT (0 0)"), Some("POINT (0 0)")]; + let probe_batch = create_probe_batch(probe_geoms); + + let mut build_batch_positions = Vec::new(); + let mut probe_indices = Vec::new(); + + // Query with empty range + for empty_ranges in [0..0, 1..1, 2..2] { + let (metrics, next_idx) = index + .query_batch( + &probe_batch, + empty_ranges.clone(), + usize::MAX, + &mut build_batch_positions, + &mut probe_indices, + ) + .await + .unwrap(); + + assert_eq!(metrics.count, 0); + assert_eq!(next_idx, empty_ranges.end); + } + } + + 
#[tokio::test] + async fn test_query_batch_range_offset() { + let build_geoms = &[Some("POINT (0 0)"), Some("POINT (1 1)")]; + let index = setup_index_for_batch_test(build_geoms, SpatialJoinOptions::default()).await; + + // Probe with 3 geometries: + // 0: POINT (0 0) - matches build[0] (should be skipped) + // 1: POINT (0 0) - matches build[0] + // 2: POINT (1 1) - matches build[1] + let probe_geoms = &[ + Some("POINT (0 0)"), + Some("POINT (0 0)"), + Some("POINT (1 1)"), + ]; + let probe_batch = create_probe_batch(probe_geoms); + + let mut build_batch_positions = Vec::new(); + let mut probe_indices = Vec::new(); + + // Query with range 1..3 (skipping the first probe) + let (metrics, next_idx) = index + .query_batch( + &probe_batch, + 1..3, + usize::MAX, + &mut build_batch_positions, + &mut probe_indices, + ) + .await + .unwrap(); + + assert_eq!(metrics.count, 2); + assert_eq!(next_idx, 3); + + // probe_indices should contain indices relative to the batch start (1 and 2) + assert_eq!(probe_indices, vec![1, 2]); + + // build_batch_positions should contain matches for probe 1 and probe 2 + // probe 1 matches build 0 (0, 0) + // probe 2 matches build 1 (0, 1) + // Note: build_batch_positions contains (batch_idx, row_idx) + // Since we have 1 batch, batch_idx is 0. 
+ assert_eq!(build_batch_positions, vec![(0, 0), (0, 1)]); + } + + #[tokio::test] + async fn test_query_batch_zero_parallel_refinement_chunk_size() { + let build_geoms = &[ + Some("POINT (0 0)"), + Some("POINT (0 0)"), + Some("POINT (0 0)"), + ]; + let options = SpatialJoinOptions { + // force synchronous refinement + parallel_refinement_chunk_size: 0, + ..Default::default() + }; + + let index = setup_index_for_batch_test(build_geoms, options).await; + let probe_geoms = &[Some("POINT (0 0)")]; + let probe_batch = create_probe_batch(probe_geoms); + + let mut build_batch_positions = Vec::new(); + let mut probe_indices = Vec::new(); + + let result = index + .query_batch( + &probe_batch, + 0..1, + 10, + &mut build_batch_positions, + &mut probe_indices, + ) + .await; + + assert!(result.is_ok()); + let (metrics, _) = result.unwrap(); + assert_eq!(metrics.count, 3); + } +} diff --git a/rust/sedona-spatial-join/src/index/spatial_index_builder.rs b/rust/sedona-spatial-join/src/index/cpu_spatial_index_builder.rs similarity index 88% rename from rust/sedona-spatial-join/src/index/spatial_index_builder.rs rename to rust/sedona-spatial-join/src/index/cpu_spatial_index_builder.rs index 49e0d8c69..86d933679 100644 --- a/rust/sedona-spatial-join/src/index/spatial_index_builder.rs +++ b/rust/sedona-spatial-join/src/index/cpu_spatial_index_builder.rs @@ -17,7 +17,6 @@ use arrow::array::BooleanBufferBuilder; use arrow_schema::SchemaRef; -use datafusion_physical_plan::metrics::{self, ExecutionPlanMetricsSet, MetricBuilder}; use sedona_common::SpatialJoinOptions; use sedona_expr::statistics::GeoStatistics; @@ -29,9 +28,11 @@ use geo_index::rtree::{sort::HilbertSort, RTree, RTreeBuilder}; use parking_lot::Mutex; use std::sync::{atomic::AtomicUsize, Arc}; +use crate::index::cpu_spatial_index::CPUSpatialIndex; +use crate::index::spatial_index::{SpatialIndexRef, SpatialJoinBuildMetrics}; use crate::{ evaluated_batch::EvaluatedBatch, - index::{knn_adapter::KnnComponents, 
spatial_index::SpatialIndex, BuildPartition}, + index::{knn_adapter::KnnComponents, BuildPartition}, operand_evaluator::create_operand_evaluator, refine::create_refiner, spatial_predicate::SpatialPredicate, @@ -59,7 +60,7 @@ const REFINER_RESERVATION_PREALLOC_SIZE: usize = 10 * 1024 * 1024; // 10MB /// 2. Building the spatial R-tree index /// 3. Setting up memory tracking and visited bitmaps /// 4. Configuring prepared geometries based on execution mode -pub struct SpatialIndexBuilder { +pub struct CPUSpatialIndexBuilder { schema: SchemaRef, spatial_predicate: SpatialPredicate, options: SpatialJoinOptions, @@ -79,25 +80,7 @@ pub struct SpatialIndexBuilder { memory_pool: Arc, } -/// Metrics for the build phase of the spatial join. -#[derive(Clone, Debug, Default)] -pub struct SpatialJoinBuildMetrics { - /// Total time for collecting build-side of join - pub(crate) build_time: metrics::Time, - /// Memory used by the spatial-index in bytes - pub(crate) build_mem_used: metrics::Gauge, -} - -impl SpatialJoinBuildMetrics { - pub fn new(partition: usize, metrics: &ExecutionPlanMetricsSet) -> Self { - Self { - build_time: MetricBuilder::new(metrics).subset_time("build_time", partition), - build_mem_used: MetricBuilder::new(metrics).gauge("build_mem_used", partition), - } - } -} - -impl SpatialIndexBuilder { +impl CPUSpatialIndexBuilder { /// Create a new builder with the given configuration. pub fn new( schema: SchemaRef, @@ -228,16 +211,16 @@ impl SpatialIndexBuilder { } /// Finish building and return the completed SpatialIndex. 
- pub fn finish(mut self) -> Result { + pub fn finish(mut self) -> Result { if self.indexed_batches.is_empty() { - return Ok(SpatialIndex::empty( + return Ok(Arc::new(CPUSpatialIndex::empty( self.spatial_predicate, self.schema, self.options, AtomicUsize::new(self.probe_threads_count), self.reservation, self.memory_pool.clone(), - )); + ))); } let evaluator = create_operand_evaluator(&self.spatial_predicate, self.options.clone()); @@ -272,21 +255,21 @@ impl SpatialIndexBuilder { .then(|| KnnComponents::new(cache_size, &self.indexed_batches, self.memory_pool.clone())) .transpose()?; - Ok(SpatialIndex { - schema: self.schema, - options: self.options, + Ok(Arc::new(CPUSpatialIndex::new( + self.schema, + self.options, evaluator, refiner, refiner_reservation, rtree, - data_id_to_batch_pos: batch_pos_vec, - indexed_batches: self.indexed_batches, + self.indexed_batches, + batch_pos_vec, geom_idx_vec, visited_build_side, - probe_threads_counter: AtomicUsize::new(self.probe_threads_count), + AtomicUsize::new(self.probe_threads_count), knn_components, - reservation: self.reservation, - }) + self.reservation, + ))) } pub async fn add_partitions(&mut self, partitions: Vec) -> Result<()> { diff --git a/rust/sedona-spatial-join/src/index/gpu_spatial_index.rs b/rust/sedona-spatial-join/src/index/gpu_spatial_index.rs new file mode 100644 index 000000000..dbe1628d6 --- /dev/null +++ b/rust/sedona-spatial-join/src/index/gpu_spatial_index.rs @@ -0,0 +1,298 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::evaluated_batch::EvaluatedBatch; +use crate::index::spatial_index::SpatialIndex; +use crate::index::QueryResultMetrics; +use crate::operand_evaluator::OperandEvaluator; +use crate::spatial_predicate::SpatialRelationType; +use crate::{operand_evaluator::create_operand_evaluator, spatial_predicate::SpatialPredicate}; +use arrow::array::BooleanBufferBuilder; +use arrow_array::{ArrayRef, RecordBatch}; +use arrow_schema::SchemaRef; +use async_trait::async_trait; +use datafusion_common::{DataFusionError, Result}; +use datafusion_execution::memory_pool::MemoryReservation; +use geo_types::{coord, Rect}; +use parking_lot::Mutex; +use sedona_common::{ExecutionMode, SpatialJoinOptions}; +use sedona_expr::statistics::GeoStatistics; +use sedona_libgpuspatial::{GpuSpatial, GpuSpatialRelationPredicate}; +use std::ops::Range; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use wkb::reader::Wkb; + +pub struct GPUSpatialIndex { + pub(crate) schema: SchemaRef, + pub(crate) _options: SpatialJoinOptions, + /// The spatial predicate evaluator for the spatial predicate. + #[allow(dead_code)] // reserved for GPU-based distance evaluation + pub(crate) evaluator: Arc, + /// GPU spatial object for performing GPU-accelerated spatial queries + pub(crate) gpu_spatial: Arc, + pub(crate) spatial_predicate: SpatialPredicate, + /// Indexed batches containing evaluated geometry arrays. It contains the original record + /// batches and geometry arrays obtained by evaluating the geometry expression on the build side. 
+ pub(crate) indexed_batches: Vec, + /// An array for translating data index to geometry batch index and row index + pub(crate) data_id_to_batch_pos: Vec<(i32, i32)>, + /// Shared bitmap builders for visited left indices, one per batch + pub(crate) visited_build_side: Option>>, + /// Counter of running probe-threads, potentially able to update `bitmap`. + /// Each time a probe thread finished probing the index, it will decrement the counter. + /// The last finished probe thread will produce the extra output batches for unmatched + /// build side when running left-outer joins. See also [`report_probe_completed`]. + pub(crate) probe_threads_counter: AtomicUsize, + /// Memory reservation for tracking the memory usage of the spatial index + /// Cleared on `SpatialIndex` drop + #[expect(dead_code)] + pub(crate) reservation: MemoryReservation, +} +impl GPUSpatialIndex { + pub fn empty( + spatial_predicate: SpatialPredicate, + schema: SchemaRef, + options: SpatialJoinOptions, + probe_threads_counter: AtomicUsize, + reservation: MemoryReservation, + ) -> Result { + let evaluator = create_operand_evaluator(&spatial_predicate, options.clone()); + + Ok(Self { + schema, + _options: options, + evaluator, + spatial_predicate, + gpu_spatial: Arc::new( + GpuSpatial::new().map_err(|e| DataFusionError::Execution(e.to_string()))?, + ), + indexed_batches: vec![], + data_id_to_batch_pos: vec![], + visited_build_side: None, + probe_threads_counter, + reservation, + }) + } + + #[allow(clippy::too_many_arguments)] + pub fn new( + spatial_predicate: SpatialPredicate, + schema: SchemaRef, + options: SpatialJoinOptions, + evaluator: Arc, + gpu_spatial: Arc, + indexed_batches: Vec, + data_id_to_batch_pos: Vec<(i32, i32)>, + visited_build_side: Option>>, + probe_threads_counter: AtomicUsize, + reservation: MemoryReservation, + ) -> Result { + Ok(Self { + schema, + _options: options, + evaluator, + spatial_predicate, + gpu_spatial, + indexed_batches, + data_id_to_batch_pos, + 
visited_build_side, + probe_threads_counter, + reservation, + }) + } + + fn refine_loaded( + &self, + probe_geoms: &ArrayRef, + predicate: &SpatialPredicate, + build_indices: &mut Vec, + probe_indices: &mut Vec, + ) -> Result<()> { + match predicate { + SpatialPredicate::Relation(rel_p) => { + self.gpu_spatial + .refine_loaded( + probe_geoms, + Self::convert_relation_type(&rel_p.relation_type)?, + build_indices, + probe_indices, + ) + .map_err(|e| { + DataFusionError::Execution(format!( + "GPU spatial refinement failed: {:?}", + e + )) + })?; + Ok(()) + } + _ => Err(DataFusionError::NotImplemented( + "Only Relation predicate is supported for GPU spatial query".to_string(), + )), + } + } + // Translate Sedona SpatialRelationType to GpuSpatialRelationPredicate + fn convert_relation_type(t: &SpatialRelationType) -> Result { + match t { + SpatialRelationType::Equals => Ok(GpuSpatialRelationPredicate::Equals), + SpatialRelationType::Touches => Ok(GpuSpatialRelationPredicate::Touches), + SpatialRelationType::Contains => Ok(GpuSpatialRelationPredicate::Contains), + SpatialRelationType::Covers => Ok(GpuSpatialRelationPredicate::Covers), + SpatialRelationType::Intersects => Ok(GpuSpatialRelationPredicate::Intersects), + SpatialRelationType::Within => Ok(GpuSpatialRelationPredicate::Within), + SpatialRelationType::CoveredBy => Ok(GpuSpatialRelationPredicate::CoveredBy), + _ => { + // This should not happen as we check for supported predicates earlier + Err(DataFusionError::Execution(format!( + "Unsupported spatial relation type for GPU: {:?}", + t + ))) + } + } + } +} + +#[async_trait] +impl SpatialIndex for GPUSpatialIndex { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } + fn get_num_indexed_batches(&self) -> usize { + self.indexed_batches.len() + } + fn get_indexed_batch(&self, batch_idx: usize) -> &RecordBatch { + &self.indexed_batches[batch_idx].batch + } + async fn query_batch( + &self, + evaluated_batch: &Arc, + range: Range, + _max_result_size: usize, + 
build_batch_positions: &mut Vec<(i32, i32)>, + probe_indices: &mut Vec, + ) -> Result<(QueryResultMetrics, usize)> { + if range.is_empty() { + return Ok(( + QueryResultMetrics { + count: 0, + candidate_count: 0, + }, + range.start, + )); + } + let gs = &self.gpu_spatial.as_ref(); + + let empty_rect = Rect::new( + coord!(x: f32::NAN, y: f32::NAN), + coord!(x: f32::NAN, y: f32::NAN), + ); + let rects: Vec<_> = range + .clone() + .map(|row_idx| evaluated_batch.geom_array.rects[row_idx].unwrap_or(empty_rect)) + .collect(); + + let (mut gpu_build_indices, mut gpu_probe_indices) = + gs.probe(rects.as_ref()).map_err(|e| { + DataFusionError::Execution(format!("GPU spatial query failed: {:?}", e)) + })?; + + assert_eq!(gpu_build_indices.len(), gpu_probe_indices.len()); + + let candidate_count = gpu_build_indices.len(); + + self.refine_loaded( + &evaluated_batch.geom_array.geometry_array, + &self.spatial_predicate, + &mut gpu_build_indices, + &mut gpu_probe_indices, + )?; + + assert_eq!(gpu_build_indices.len(), gpu_probe_indices.len()); + + let total_count = gpu_build_indices.len(); + + for (build_idx, probe_idx) in gpu_build_indices.iter().zip(gpu_probe_indices.iter()) { + let data_id = *build_idx as usize; + let (batch_idx, row_idx) = self.data_id_to_batch_pos[data_id]; + build_batch_positions.push((batch_idx, row_idx)); + probe_indices.push(range.start as u32 + probe_idx); + } + Ok(( + QueryResultMetrics { + count: total_count, + candidate_count, + }, + range.end, + )) + } + fn need_more_probe_stats(&self) -> bool { + false + } + + fn merge_probe_stats(&self, stats: GeoStatistics) { + let _ = stats; + } + + fn visited_build_side(&self) -> Option<&Mutex>> { + self.visited_build_side.as_ref() + } + + fn report_probe_completed(&self) -> bool { + self.probe_threads_counter.fetch_sub(1, Ordering::Relaxed) == 1 + } + + fn get_refiner_mem_usage(&self) -> usize { + 0 + } + + fn get_actual_execution_mode(&self) -> ExecutionMode { + ExecutionMode::PrepareBuild // GPU-based spatial 
index is always on PrepareBuild mode + } + #[allow(unused)] + fn query( + &self, + probe_wkb: &Wkb, + probe_rect: &Rect, + distance: &Option, + build_batch_positions: &mut Vec<(i32, i32)>, + ) -> Result { + let _ = (probe_wkb, probe_rect, distance, build_batch_positions); + Err(DataFusionError::NotImplemented( + "Serial query is not implemented for GPU spatial index".to_string(), + )) + } + + fn query_knn( + &self, + probe_wkb: &Wkb, + k: u32, + use_spheroid: bool, + include_tie_breakers: bool, + build_batch_positions: &mut Vec<(i32, i32)>, + ) -> Result { + let _ = ( + probe_wkb, + k, + use_spheroid, + include_tie_breakers, + build_batch_positions, + ); + Err(DataFusionError::NotImplemented( + "KNN query is not implemented for GPU spatial index".to_string(), + )) + } +} diff --git a/rust/sedona-spatial-join/src/index/gpu_spatial_index_builder.rs b/rust/sedona-spatial-join/src/index/gpu_spatial_index_builder.rs new file mode 100644 index 000000000..af36d8377 --- /dev/null +++ b/rust/sedona-spatial-join/src/index/gpu_spatial_index_builder.rs @@ -0,0 +1,261 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::index::gpu_spatial_index::GPUSpatialIndex; +use crate::index::spatial_index::{SpatialIndexRef, SpatialJoinBuildMetrics}; +use crate::operand_evaluator::EvaluatedGeometryArray; +use crate::utils::join_utils::need_produce_result_in_final; +use crate::{ + evaluated_batch::EvaluatedBatch, index::BuildPartition, + operand_evaluator::create_operand_evaluator, spatial_predicate::SpatialPredicate, +}; +use arrow::array::BooleanBufferBuilder; +use arrow::compute::concat; +use arrow_array::RecordBatch; +use arrow_schema::SchemaRef; +use datafusion_common::Result; +use datafusion_common::{DataFusionError, JoinType}; +use datafusion_execution::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation}; +use futures::StreamExt; +use geo_types::{coord, Rect}; +use parking_lot::Mutex; +use sedona_common::SpatialJoinOptions; +use sedona_libgpuspatial::{GpuSpatial, GpuSpatialOptions}; +use std::sync::atomic::AtomicUsize; +use std::sync::Arc; + +pub struct GPUSpatialIndexBuilder { + schema: SchemaRef, + spatial_predicate: SpatialPredicate, + options: SpatialJoinOptions, + join_type: JoinType, + probe_threads_count: usize, + metrics: SpatialJoinBuildMetrics, + /// Batches to be indexed + indexed_batches: Vec, + /// Memory reservation for tracking the memory usage of the spatial index + reservation: MemoryReservation, +} + +impl GPUSpatialIndexBuilder { + pub fn new( + schema: SchemaRef, + spatial_predicate: SpatialPredicate, + options: SpatialJoinOptions, + join_type: JoinType, + probe_threads_count: usize, + memory_pool: Arc, + metrics: SpatialJoinBuildMetrics, + ) -> Self { + let consumer = MemoryConsumer::new("SpatialJoinIndex"); + let reservation = consumer.register(&memory_pool); + + Self { + schema, + spatial_predicate, + options, + join_type, + probe_threads_count, + metrics, + indexed_batches: vec![], + reservation, + } + } + + /// Build visited bitmaps for tracking left-side indices in outer joins. 
+ fn build_visited_bitmaps(&mut self) -> Result>>> { + if !need_produce_result_in_final(self.join_type) { + return Ok(None); + } + + let mut bitmaps = Vec::with_capacity(self.indexed_batches.len()); + let mut total_buffer_size = 0; + + for batch in &self.indexed_batches { + let batch_rows = batch.batch.num_rows(); + let buffer_size = batch_rows.div_ceil(8); + total_buffer_size += buffer_size; + + let mut bitmap = BooleanBufferBuilder::new(batch_rows); + bitmap.append_n(batch_rows, false); + bitmaps.push(bitmap); + } + + self.reservation.try_grow(total_buffer_size)?; + self.metrics.build_mem_used.add(total_buffer_size); + + Ok(Some(Mutex::new(bitmaps))) + } + + /// Finish building and return the completed SpatialIndex. + pub fn finish(mut self) -> Result { + if self.indexed_batches.is_empty() { + return Ok(Arc::new(GPUSpatialIndex::empty( + self.spatial_predicate, + self.schema, + self.options, + AtomicUsize::new(self.probe_threads_count), + self.reservation, + )?)); + } + let build_timer = self.metrics.build_time.timer(); + + let gs_options = GpuSpatialOptions { + concurrency: self.probe_threads_count as u32, + device_id: self.options.gpu.device_id as i32, + compress_bvh: self.options.gpu.compress_bvh, + pipeline_batches: self.options.gpu.pipeline_batches as u32, + cuda_init_memory_pool_ratio: self.options.gpu.init_memory_pool_percentage as f32 + / 100.0, // convert percentage to ratio + }; + + let mut gs = GpuSpatial::new() + .and_then(|mut gs| { + gs.init(gs_options)?; + Ok(gs) + }) + .map_err(|e| { + DataFusionError::Execution(format!("Failed to initialize GPU context {e:?}")) + })?; + + // Concat indexed batches into a single batch to reduce build time + if self.options.gpu.concat_build { + let all_record_batches: Vec<&RecordBatch> = self + .indexed_batches + .iter() + .map(|batch| &batch.batch) + .collect(); + let schema = all_record_batches[0].schema(); + let batch = + arrow::compute::concat_batches(&schema, all_record_batches).map_err(|e| { + 
DataFusionError::Execution(format!("Failed to concatenate left batches: {}", e)) + })?; + + let references: Vec<&dyn arrow::array::Array> = self + .indexed_batches + .iter() + .map(|batch| batch.geom_array.geometry_array.as_ref()) + .collect(); + + let concat_array = concat(&references)?; + let rects = self + .indexed_batches + .iter() + .flat_map(|batch| batch.geom_array.rects.iter().cloned()) + .collect(); + let eval_batch = EvaluatedBatch { + batch, + geom_array: EvaluatedGeometryArray { + geometry_array: Arc::new(concat_array), + rects, + distance: None, + wkbs: vec![], + }, + }; + self.indexed_batches.clear(); + self.indexed_batches.push(eval_batch); + } + + let mut data_id_to_batch_pos: Vec<(i32, i32)> = Vec::with_capacity( + self.indexed_batches + .iter() + .map(|x| x.batch.num_rows()) + .sum(), + ); + let empty_rect = Rect::new( + coord!(x: f32::NAN, y: f32::NAN), + coord!(x: f32::NAN, y: f32::NAN), + ); + for (batch_idx, batch) in self.indexed_batches.iter().enumerate() { + let rects = batch.rects(); + let mut native_rects = Vec::new(); + for (idx, rect_opt) in rects.iter().enumerate() { + if let Some(rect) = rect_opt { + native_rects.push(*rect); + } else { + native_rects.push(empty_rect); + } + data_id_to_batch_pos.push((batch_idx as i32, idx as i32)); + } + // Add rectangles from build side to the spatial index + gs.index_push_build(&native_rects).map_err(|e| { + DataFusionError::Execution(format!( + "Failed to push rectangles to GPU spatial index {e:?}" + )) + })?; + gs.refiner_push_build(&batch.geom_array.geometry_array) + .map_err(|e| { + DataFusionError::Execution(format!( + "Failed to add geometries to GPU refiner {e:?}" + )) + })?; + } + + gs.index_finish_building().map_err(|e| { + DataFusionError::Execution(format!("Failed to build spatial index on GPU {e:?}")) + })?; + gs.refiner_finish_building().map_err(|e| { + DataFusionError::Execution(format!("Failed to build spatial refiner on GPU {e:?}")) + })?; + build_timer.done(); + let 
visited_build_side = self.build_visited_bitmaps()?; + let evaluator = create_operand_evaluator(&self.spatial_predicate, self.options.clone()); + // Build index for rectangle queries + Ok(Arc::new(GPUSpatialIndex::new( + self.spatial_predicate, + self.schema, + self.options, + evaluator, + Arc::new(gs), + self.indexed_batches, + data_id_to_batch_pos, + visited_build_side, + AtomicUsize::new(self.probe_threads_count), + self.reservation, + )?)) + } + + /// Add a geometry batch to be indexed. + /// + /// This method accumulates geometry batches that will be used to build the spatial index. + /// Each batch contains processed geometry data along with memory usage information. + pub fn add_batch(&mut self, indexed_batch: EvaluatedBatch) -> Result<()> { + let in_mem_size = indexed_batch.in_mem_size()?; + self.indexed_batches.push(indexed_batch); + self.reservation.grow(in_mem_size); + self.metrics.build_mem_used.add(in_mem_size); + Ok(()) + } + + pub async fn add_partition(&mut self, mut partition: BuildPartition) -> Result<()> { + let mut stream = partition.build_side_batch_stream; + while let Some(batch) = stream.next().await { + let indexed_batch = batch?; + self.add_batch(indexed_batch)?; + } + let mem_bytes = partition.reservation.free(); + self.reservation.try_grow(mem_bytes)?; + Ok(()) + } + + pub async fn add_partitions(&mut self, partitions: Vec) -> Result<()> { + for partition in partitions { + self.add_partition(partition).await?; + } + Ok(()) + } +} diff --git a/rust/sedona-spatial-join/src/index/spatial_index.rs b/rust/sedona-spatial-join/src/index/spatial_index.rs index e5e69dd88..deab1839f 100644 --- a/rust/sedona-spatial-join/src/index/spatial_index.rs +++ b/rust/sedona-spatial-join/src/index/spatial_index.rs @@ -15,157 +15,50 @@ // specific language governing permissions and limitations // under the License. 
-use std::{ - ops::Range, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, - }, -}; - use arrow_array::RecordBatch; use arrow_schema::SchemaRef; -use datafusion_common::{DataFusionError, Result}; -use datafusion_common_runtime::JoinSet; -use datafusion_execution::memory_pool::{MemoryPool, MemoryReservation}; -use float_next_after::NextAfter; -use geo::BoundingRect; -use geo_index::rtree::{ - distance::{DistanceMetric, GeometryAccessor}, - util::f64_box_to_f32, -}; -use geo_index::rtree::{sort::HilbertSort, RTree, RTreeBuilder, RTreeIndex}; -use geo_index::IndexableNum; +use async_trait::async_trait; +use datafusion_common::Result; use geo_types::Rect; use parking_lot::Mutex; use sedona_expr::statistics::GeoStatistics; -use sedona_geo::to_geo::item_to_geometry; +use std::{ops::Range, sync::Arc}; use wkb::reader::Wkb; -use crate::{ - evaluated_batch::EvaluatedBatch, - index::{ - knn_adapter::{KnnComponents, SedonaKnnAdapter}, - IndexQueryResult, QueryResultMetrics, - }, - operand_evaluator::{create_operand_evaluator, distance_value_at, OperandEvaluator}, - refine::{create_refiner, IndexQueryResultRefiner}, - spatial_predicate::SpatialPredicate, - utils::concurrent_reservation::ConcurrentReservation, -}; +use crate::{evaluated_batch::EvaluatedBatch, index::QueryResultMetrics}; use arrow::array::BooleanBufferBuilder; -use sedona_common::{option::SpatialJoinOptions, sedona_internal_err, ExecutionMode}; - -pub struct SpatialIndex { - pub(crate) schema: SchemaRef, - pub(crate) options: SpatialJoinOptions, - - /// The spatial predicate evaluator for the spatial predicate. - pub(crate) evaluator: Arc, - - /// The refiner for refining the index query results. - pub(crate) refiner: Arc, - - /// Memory reservation for tracking the memory usage of the refiner - pub(crate) refiner_reservation: ConcurrentReservation, - - /// R-tree index for the geometry batches. It takes MBRs as query windows and returns - /// data indexes. 
These data indexes should be translated using `data_id_to_batch_pos` to get - /// the original geometry batch index and row index, or translated using `prepared_geom_idx_vec` - /// to get the prepared geometries array index. - pub(crate) rtree: RTree, - - /// Indexed batches containing evaluated geometry arrays. It contains the original record - /// batches and geometry arrays obtained by evaluating the geometry expression on the build side. - pub(crate) indexed_batches: Vec, - /// An array for translating rtree data index to geometry batch index and row index - pub(crate) data_id_to_batch_pos: Vec<(i32, i32)>, - - /// An array for translating rtree data index to consecutive index. Each geometry may be indexed by - /// multiple boxes, so there could be multiple data indexes for the same geometry. A mapping for - /// squashing the index makes it easier for persisting per-geometry auxiliary data for evaluating - /// the spatial predicate. This is extensively used by the spatial predicate evaluators for storing - /// prepared geometries. - pub(crate) geom_idx_vec: Vec, - - /// Shared bitmap builders for visited build side indices, one per batch - pub(crate) visited_build_side: Option>>, - - /// Counter of running probe-threads, potentially able to update `bitmap`. - /// Each time a probe thread finished probing the index, it will decrement the counter. - /// The last finished probe thread will produce the extra output batches for unmatched - /// build side when running left-outer joins. See also [`report_probe_completed`]. 
- pub(crate) probe_threads_counter: AtomicUsize, - - /// Shared KNN components (distance metrics and geometry cache) for efficient KNN queries - pub(crate) knn_components: Option, - - /// Memory reservation for tracking the memory usage of the spatial index - /// Cleared on `SpatialIndex` drop - #[expect(dead_code)] - pub(crate) reservation: MemoryReservation, +use datafusion_physical_plan::metrics; +use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder}; +use sedona_common::ExecutionMode; + +/// Metrics for the build phase of the spatial join. +#[derive(Clone, Debug, Default)] +pub struct SpatialJoinBuildMetrics { + /// Total time for collecting build-side of join + pub(crate) build_time: metrics::Time, + /// Memory used by the spatial-index in bytes + pub(crate) build_mem_used: metrics::Gauge, } -impl SpatialIndex { - pub fn empty( - spatial_predicate: SpatialPredicate, - schema: SchemaRef, - options: SpatialJoinOptions, - probe_threads_counter: AtomicUsize, - mut reservation: MemoryReservation, - memory_pool: Arc, - ) -> Self { - let evaluator = create_operand_evaluator(&spatial_predicate, options.clone()); - let refiner = create_refiner( - options.spatial_library, - &spatial_predicate, - options.clone(), - 0, - GeoStatistics::empty(), - ); - let refiner_reservation = reservation.split(0); - let refiner_reservation = ConcurrentReservation::try_new(0, refiner_reservation).unwrap(); - let rtree = RTreeBuilder::::new(0).finish::(); - let knn_components = matches!(spatial_predicate, SpatialPredicate::KNearestNeighbors(_)) - .then(|| KnnComponents::new(0, &[], memory_pool.clone()).unwrap()); +impl SpatialJoinBuildMetrics { + pub fn new(partition: usize, metrics: &ExecutionPlanMetricsSet) -> Self { Self { - schema, - options, - evaluator, - refiner, - refiner_reservation, - rtree, - data_id_to_batch_pos: Vec::new(), - indexed_batches: Vec::new(), - geom_idx_vec: Vec::new(), - visited_build_side: None, - probe_threads_counter, - knn_components, 
- reservation, + build_time: MetricBuilder::new(metrics).subset_time("build_time", partition), + build_mem_used: MetricBuilder::new(metrics).gauge("build_mem_used", partition), } } +} - pub(crate) fn schema(&self) -> SchemaRef { - self.schema.clone() - } +#[async_trait] +pub(crate) trait SpatialIndex { + fn schema(&self) -> SchemaRef; - /// Create a KNN geometry accessor for accessing geometries with caching - fn create_knn_accessor(&self) -> Result> { - let Some(knn_components) = self.knn_components.as_ref() else { - return sedona_internal_err!("knn_components is not initialized when running KNN join"); - }; - Ok(SedonaKnnAdapter::new( - &self.indexed_batches, - &self.data_id_to_batch_pos, - knn_components, - )) - } + /// Get all the indexed batches. + #[allow(dead_code)] // used in tests + fn get_num_indexed_batches(&self) -> usize; /// Get the batch at the given index. - pub(crate) fn get_indexed_batch(&self, batch_idx: usize) -> &RecordBatch { - &self.indexed_batches[batch_idx].batch - } - + fn get_indexed_batch(&self, batch_idx: usize) -> &RecordBatch; /// Query the spatial index with a probe geometry to find matching build-side geometries. /// /// This method implements a two-phase spatial join query: @@ -184,32 +77,14 @@ impl SpatialIndex { /// # Returns /// * `JoinResultMetrics` containing the number of actual matches (`count`) and the number /// of candidates from the filter phase (`candidate_count`) - #[allow(unused)] - pub(crate) fn query( + #[allow(dead_code)] // for future use + fn query( &self, probe_wkb: &Wkb, probe_rect: &Rect, distance: &Option, build_batch_positions: &mut Vec<(i32, i32)>, - ) -> Result { - let min = probe_rect.min(); - let max = probe_rect.max(); - let mut candidates = self.rtree.search(min.x, min.y, max.x, max.y); - if candidates.is_empty() { - return Ok(QueryResultMetrics { - count: 0, - candidate_count: 0, - }); - } - - // Sort and dedup candidates to avoid duplicate results when we index one geometry - // using several boxes. 
- candidates.sort_unstable(); - candidates.dedup(); - - // Refine the candidates retrieved from the r-tree index by evaluating the actual spatial predicate - self.refine(probe_wkb, &candidates, distance, build_batch_positions) - } + ) -> Result; /// Query the spatial index for k nearest neighbors of a given geometry. /// @@ -229,192 +104,14 @@ impl SpatialIndex { /// # Returns /// /// * `JoinResultMetrics` containing the number of actual matches and candidates processed - pub(crate) fn query_knn( + fn query_knn( &self, probe_wkb: &Wkb, k: u32, use_spheroid: bool, include_tie_breakers: bool, build_batch_positions: &mut Vec<(i32, i32)>, - ) -> Result { - if k == 0 { - return Ok(QueryResultMetrics { - count: 0, - candidate_count: 0, - }); - } - - // Check if index is empty - if self.indexed_batches.is_empty() || self.data_id_to_batch_pos.is_empty() { - return Ok(QueryResultMetrics { - count: 0, - candidate_count: 0, - }); - } - - // Convert probe WKB to geo::Geometry - let probe_geom = match item_to_geometry(probe_wkb) { - Ok(geom) => geom, - Err(_) => { - // Empty or unsupported geometries (e.g., POINT EMPTY) return empty results - return Ok(QueryResultMetrics { - count: 0, - candidate_count: 0, - }); - } - }; - - // Select the appropriate distance metric - let distance_metric: &dyn DistanceMetric = { - let Some(knn_components) = self.knn_components.as_ref() else { - return sedona_internal_err!( - "knn_components is not initialized when running KNN join" - ); - }; - if use_spheroid { - &knn_components.haversine_metric - } else { - &knn_components.euclidean_metric - } - }; - - // Create geometry accessor for on-demand WKB decoding and caching - let geometry_accessor = self.create_knn_accessor()?; - - // Use neighbors_geometry to find k nearest neighbors - let initial_results = self.rtree.neighbors_geometry( - &probe_geom, - Some(k as usize), - None, // no max_distance filter - distance_metric, - &geometry_accessor, - ); - - if initial_results.is_empty() { - return 
Ok(QueryResultMetrics { - count: 0, - candidate_count: 0, - }); - } - - let mut final_results = initial_results; - let mut candidate_count = final_results.len(); - - // Handle tie-breakers if enabled - if include_tie_breakers && !final_results.is_empty() && k > 0 { - // Calculate distances for the initial k results to find the k-th distance - let mut distances_with_indices: Vec<(f64, u32)> = Vec::new(); - - for &result_idx in &final_results { - if (result_idx as usize) < self.data_id_to_batch_pos.len() { - if let Some(item_geom) = geometry_accessor.get_geometry(result_idx as usize) { - let distance = distance_metric.distance_to_geometry(&probe_geom, item_geom); - if let Some(distance_f64) = distance.to_f64() { - distances_with_indices.push((distance_f64, result_idx)); - } - } - } - } - - // Sort by distance - distances_with_indices - .sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)); - - // Find the k-th distance (if we have at least k results) - if distances_with_indices.len() >= k as usize { - let k_idx = (k as usize) - .min(distances_with_indices.len()) - .saturating_sub(1); - let max_distance = distances_with_indices[k_idx].0; - - // For tie-breakers, create spatial envelope around probe centroid and use rtree.search() - - // Create envelope bounds by expanding the probe bounding box by max_distance - let Some(rect) = probe_geom.bounding_rect() else { - // If bounding rectangle cannot be computed, return empty results - return Ok(QueryResultMetrics { - count: 0, - candidate_count: 0, - }); - }; - - let min = rect.min(); - let max = rect.max(); - let (min_x, min_y, max_x, max_y) = f64_box_to_f32(min.x, min.y, max.x, max.y); - let mut distance_f32 = max_distance as f32; - if (distance_f32 as f64) < max_distance { - distance_f32 = distance_f32.next_after(f32::INFINITY); - } - let (min_x, min_y, max_x, max_y) = ( - min_x - distance_f32, - min_y - distance_f32, - max_x + distance_f32, - max_y + distance_f32, - ); - - // Use rtree.search() 
with envelope bounds (like the old code) - let expanded_results = self.rtree.search(min_x, min_y, max_x, max_y); - - candidate_count = expanded_results.len(); - - // Calculate distances for all results and find ties - let mut all_distances_with_indices: Vec<(f64, u32)> = Vec::new(); - - for &result_idx in &expanded_results { - if (result_idx as usize) < self.data_id_to_batch_pos.len() { - if let Some(item_geom) = geometry_accessor.get_geometry(result_idx as usize) - { - let distance = - distance_metric.distance_to_geometry(&probe_geom, item_geom); - if let Some(distance_f64) = distance.to_f64() { - all_distances_with_indices.push((distance_f64, result_idx)); - } - } - } - } - - // Sort by distance - all_distances_with_indices - .sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(std::cmp::Ordering::Equal)); - - // Include all results up to and including those with the same distance as the k-th result - const DISTANCE_TOLERANCE: f64 = 1e-9; - let mut tie_breaker_results: Vec = Vec::new(); - - for (i, &(distance, result_idx)) in all_distances_with_indices.iter().enumerate() { - if i < k as usize { - // Include the first k results - tie_breaker_results.push(result_idx); - } else if (distance - max_distance).abs() <= DISTANCE_TOLERANCE { - // Include tie-breakers (same distance as k-th result) - tie_breaker_results.push(result_idx); - } else { - // No more ties, stop - break; - } - } - - final_results = tie_breaker_results; - } - } else { - // When tie-breakers are disabled, limit results to exactly k - if final_results.len() > k as usize { - final_results.truncate(k as usize); - } - } - - // Convert results to build_batch_positions using existing data_id_to_batch_pos mapping - for &result_idx in &final_results { - if (result_idx as usize) < self.data_id_to_batch_pos.len() { - build_batch_positions.push(self.data_id_to_batch_pos[result_idx as usize]); - } - } - - Ok(QueryResultMetrics { - count: final_results.len(), - candidate_count, - }) - } + ) -> Result; /// Query the 
spatial index with a batch of probe geometries to find matching build-side geometries. /// @@ -435,1549 +132,42 @@ impl SpatialIndex { /// * `probe_indices` - Output vector that will be populated with the probe row index (in /// `evaluated_batch`) for each match appended to `build_batch_positions`. /// This means the probe index is repeated `N` times when a probe geometry produces `N` matches, - /// keeping `probe_indices.len()` in sync with `build_batch_positions.len()`. + /// keeping `probe_indices.len()` in sync with `build_batch_positions.len()`. `probe_indices` should be sorted in **ascending order**. /// /// # Returns /// * A tuple containing: /// - `QueryResultMetrics`: Aggregated metrics (total matches and candidates) for the processed rows /// - `usize`: The index of the next row to process (exclusive end of the processed range) - pub(crate) async fn query_batch( - self: &Arc, + async fn query_batch( + &self, evaluated_batch: &Arc, range: Range, max_result_size: usize, build_batch_positions: &mut Vec<(i32, i32)>, probe_indices: &mut Vec, - ) -> Result<(QueryResultMetrics, usize)> { - if range.is_empty() { - return Ok(( - QueryResultMetrics { - count: 0, - candidate_count: 0, - }, - range.start, - )); - } - - let rects = evaluated_batch.rects(); - let dist = evaluated_batch.distance(); - let mut total_candidates_count = 0; - let mut total_count = 0; - let mut current_row_idx = range.start; - for row_idx in range { - current_row_idx = row_idx; - let Some(probe_rect) = rects[row_idx] else { - continue; - }; - - let min = probe_rect.min(); - let max = probe_rect.max(); - let mut candidates = self.rtree.search(min.x, min.y, max.x, max.y); - if candidates.is_empty() { - continue; - } - - let Some(probe_wkb) = evaluated_batch.wkb(row_idx) else { - return sedona_internal_err!( - "Failed to get WKB for row {} in evaluated batch", - row_idx - ); - }; - - // Sort and dedup candidates to avoid duplicate results when we index one geometry - // using several boxes. 
- candidates.sort_unstable(); - candidates.dedup(); - - let distance = match dist { - Some(dist_array) => distance_value_at(dist_array, row_idx)?, - None => None, - }; - - // Refine the candidates retrieved from the r-tree index by evaluating the actual spatial predicate - let refine_chunk_size = self.options.parallel_refinement_chunk_size; - if refine_chunk_size == 0 || candidates.len() < refine_chunk_size * 2 { - // For small candidate sets, use refine synchronously - let metrics = - self.refine(probe_wkb, &candidates, &distance, build_batch_positions)?; - probe_indices.extend(std::iter::repeat_n(row_idx as u32, metrics.count)); - total_count += metrics.count; - total_candidates_count += metrics.candidate_count; - } else { - // For large candidate sets, spawn several tasks to parallelize refinement - let (metrics, positions) = self - .refine_concurrently( - evaluated_batch, - row_idx, - &candidates, - distance, - refine_chunk_size, - ) - .await?; - build_batch_positions.extend(positions); - probe_indices.extend(std::iter::repeat_n(row_idx as u32, metrics.count)); - total_count += metrics.count; - total_candidates_count += metrics.candidate_count; - } - - if total_count >= max_result_size { - break; - } - } - - let end_idx = current_row_idx + 1; - Ok(( - QueryResultMetrics { - count: total_count, - candidate_count: total_candidates_count, - }, - end_idx, - )) - } - - async fn refine_concurrently( - self: &Arc, - evaluated_batch: &Arc, - row_idx: usize, - candidates: &[u32], - distance: Option, - refine_chunk_size: usize, - ) -> Result<(QueryResultMetrics, Vec<(i32, i32)>)> { - let mut join_set = JoinSet::new(); - for (i, chunk) in candidates.chunks(refine_chunk_size).enumerate() { - let cloned_evaluated_batch = Arc::clone(evaluated_batch); - let chunk = chunk.to_vec(); - let index_ref = Arc::clone(self); - join_set.spawn(async move { - let Some(probe_wkb) = cloned_evaluated_batch.wkb(row_idx) else { - return ( - i, - sedona_internal_err!( - "Failed to get WKB for 
row {} in evaluated batch", - row_idx - ), - ); - }; - let mut local_positions: Vec<(i32, i32)> = Vec::with_capacity(chunk.len()); - let res = index_ref.refine(probe_wkb, &chunk, &distance, &mut local_positions); - (i, res.map(|r| (r, local_positions))) - }); - } - - // Collect the results in order - let mut refine_results = Vec::with_capacity(join_set.len()); - refine_results.resize_with(join_set.len(), || None); - while let Some(res) = join_set.join_next().await { - let (chunk_idx, refine_res) = - res.map_err(|e| DataFusionError::External(Box::new(e)))?; - let (metrics, positions) = refine_res?; - refine_results[chunk_idx] = Some((metrics, positions)); - } - - let mut total_metrics = QueryResultMetrics { - count: 0, - candidate_count: 0, - }; - let mut all_positions = Vec::with_capacity(candidates.len()); - for res in refine_results { - let (metrics, positions) = res.expect("All chunks should be processed"); - total_metrics.count += metrics.count; - total_metrics.candidate_count += metrics.candidate_count; - all_positions.extend(positions); - } - - Ok((total_metrics, all_positions)) - } - - fn refine( - &self, - probe_wkb: &Wkb, - candidates: &[u32], - distance: &Option, - build_batch_positions: &mut Vec<(i32, i32)>, - ) -> Result { - let candidate_count = candidates.len(); - - let mut index_query_results = Vec::with_capacity(candidate_count); - for data_idx in candidates { - let pos = self.data_id_to_batch_pos[*data_idx as usize]; - let (batch_idx, row_idx) = pos; - let indexed_batch = &self.indexed_batches[batch_idx as usize]; - let build_wkb = indexed_batch.wkb(row_idx as usize); - let Some(build_wkb) = build_wkb else { - continue; - }; - let distance = self.evaluator.resolve_distance( - indexed_batch.distance(), - row_idx as usize, - distance, - )?; - let geom_idx = self.geom_idx_vec[*data_idx as usize]; - index_query_results.push(IndexQueryResult { - wkb: build_wkb, - distance, - geom_idx, - position: pos, - }); - } - - if index_query_results.is_empty() { - 
return Ok(QueryResultMetrics { - count: 0, - candidate_count, - }); - } - - let results = self.refiner.refine(probe_wkb, &index_query_results)?; - let num_results = results.len(); - build_batch_positions.extend(results); - - // Update refiner memory reservation - self.refiner_reservation.resize(self.refiner.mem_usage())?; - - Ok(QueryResultMetrics { - count: num_results, - candidate_count, - }) - } + ) -> Result<(QueryResultMetrics, usize)>; /// Check if the index needs more probe statistics to determine the optimal execution mode. /// /// # Returns /// * `bool` - `true` if the index needs more probe statistics, `false` otherwise. - pub(crate) fn need_more_probe_stats(&self) -> bool { - self.refiner.need_more_probe_stats() - } - + fn need_more_probe_stats(&self) -> bool; /// Merge the probe statistics into the index. /// /// # Arguments /// * `stats` - The probe statistics to merge. - pub(crate) fn merge_probe_stats(&self, stats: GeoStatistics) { - self.refiner.merge_probe_stats(stats); - } + fn merge_probe_stats(&self, stats: GeoStatistics); - /// Get the bitmaps for tracking visited left-side indices. The bitmaps will be updated + /// Get the bitmaps for tracking visited build-side indices. The bitmaps will be updated /// by the spatial join stream when producing output batches during index probing phase. - pub(crate) fn visited_build_side(&self) -> Option<&Mutex>> { - self.visited_build_side.as_ref() - } - + fn visited_build_side(&self) -> Option<&Mutex>>; /// Decrements counter of running threads, and returns `true` /// if caller is the last running thread - pub(crate) fn report_probe_completed(&self) -> bool { - self.probe_threads_counter.fetch_sub(1, Ordering::Relaxed) == 1 - } - + fn report_probe_completed(&self) -> bool; /// Get the memory usage of the refiner in bytes. 
- pub(crate) fn get_refiner_mem_usage(&self) -> usize { - self.refiner.mem_usage() - } - + fn get_refiner_mem_usage(&self) -> usize; /// Get the actual execution mode used by the refiner - pub(crate) fn get_actual_execution_mode(&self) -> ExecutionMode { - self.refiner.actual_execution_mode() - } + fn get_actual_execution_mode(&self) -> ExecutionMode; } -#[cfg(test)] -mod tests { - use crate::{ - index::{SpatialIndexBuilder, SpatialJoinBuildMetrics}, - operand_evaluator::EvaluatedGeometryArray, - spatial_predicate::{KNNPredicate, RelationPredicate, SpatialRelationType}, - }; - - use super::*; - use arrow_array::RecordBatch; - use arrow_schema::{DataType, Field}; - use datafusion_common::JoinSide; - use datafusion_execution::memory_pool::GreedyMemoryPool; - use datafusion_expr::JoinType; - use datafusion_physical_expr::expressions::Column; - use geo_traits::Dimensions; - use sedona_common::option::{ExecutionMode, SpatialJoinOptions}; - use sedona_geometry::wkb_factory::write_wkb_empty_point; - use sedona_schema::datatypes::WKB_GEOMETRY; - use sedona_testing::create::create_array; - - #[test] - fn test_spatial_index_builder_empty() { - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - let schema = Arc::new(arrow_schema::Schema::empty()); - let spatial_predicate = SpatialPredicate::Relation(RelationPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - SpatialRelationType::Intersects, - )); - - let builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 4, - memory_pool, - metrics, - ) - .unwrap(); - - // Test finishing with empty data - let index = builder.finish().unwrap(); - assert_eq!(index.schema(), schema); - assert_eq!(index.indexed_batches.len(), 0); - } - - #[test] - fn 
test_spatial_index_builder_add_batch() { - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - - let spatial_predicate = SpatialPredicate::Relation(RelationPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - SpatialRelationType::Intersects, - )); - - // Create a simple test geometry batch - let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 4, - memory_pool, - metrics, - ) - .unwrap(); - - let batch = RecordBatch::new_empty(schema.clone()); - let geom_batch = create_array( - &[ - Some("POINT (0.25 0.25)"), - Some("POINT (10 10)"), - None, - Some("POINT (0.25 0.25)"), - ], - &WKB_GEOMETRY, - ); - let indexed_batch = EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), - }; - builder.add_batch(indexed_batch).unwrap(); - - let index = builder.finish().unwrap(); - assert_eq!(index.schema(), schema); - assert_eq!(index.indexed_batches.len(), 1); - } - - #[test] - fn test_knn_query_execution_with_sample_data() { - // Create a spatial index with sample geometry data - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - - let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - 5, - false, - JoinSide::Left, - )); - - // Create sample geometry data - points at known locations - let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - 
DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 4, - memory_pool, - metrics, - ) - .unwrap(); - - let batch = RecordBatch::new_empty(schema.clone()); - - // Create geometries at different distances from the query point (0, 0) - let geom_batch = create_array( - &[ - Some("POINT (1 0)"), // Distance: 1.0 - Some("POINT (0 2)"), // Distance: 2.0 - Some("POINT (3 0)"), // Distance: 3.0 - Some("POINT (0 4)"), // Distance: 4.0 - Some("POINT (5 0)"), // Distance: 5.0 - Some("POINT (2 2)"), // Distance: ~2.83 - Some("POINT (1 1)"), // Distance: ~1.41 - ], - &WKB_GEOMETRY, - ); - - let indexed_batch = EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), - }; - builder.add_batch(indexed_batch).unwrap(); - - let index = builder.finish().unwrap(); - - // Create a query geometry at origin (0, 0) - let query_geom = create_array(&[Some("POINT (0 0)")], &WKB_GEOMETRY); - let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); - let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); - - // Test KNN query with k=3 - let mut build_positions = Vec::new(); - let result = index - .query_knn( - query_wkb, - 3, // k=3 - false, // use_spheroid=false - false, // include_tie_breakers=false - &mut build_positions, - ) - .unwrap(); - - // Verify we got 3 results - assert_eq!(build_positions.len(), 3); - assert_eq!(result.count, 3); - assert!(result.candidate_count >= 3); - - // Create a mapping of positions to verify correct ordering - // We expect the 3 closest points: (1,0), (1,1), (0,2) - let expected_closest_indices = vec![0, 6, 1]; // Based on our sample data ordering - let mut found_indices = Vec::new(); - - for (_batch_idx, row_idx) in &build_positions { - found_indices.push(*row_idx as usize); - } - - // Sort to compare sets (order might vary due to implementation) - found_indices.sort(); - let 
mut expected_sorted = expected_closest_indices; - expected_sorted.sort(); - - assert_eq!(found_indices, expected_sorted); - } - - #[test] - fn test_knn_query_execution_with_different_k_values() { - // Create spatial index with more data points - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - - let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - 5, - false, - JoinSide::Left, - )); - - let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 4, - memory_pool, - metrics, - ) - .unwrap(); - - let batch = RecordBatch::new_empty(schema.clone()); - - // Create 10 points at regular intervals - let geom_batch = create_array( - &[ - Some("POINT (1 0)"), // 0: Distance 1 - Some("POINT (2 0)"), // 1: Distance 2 - Some("POINT (3 0)"), // 2: Distance 3 - Some("POINT (4 0)"), // 3: Distance 4 - Some("POINT (5 0)"), // 4: Distance 5 - Some("POINT (6 0)"), // 5: Distance 6 - Some("POINT (7 0)"), // 6: Distance 7 - Some("POINT (8 0)"), // 7: Distance 8 - Some("POINT (9 0)"), // 8: Distance 9 - Some("POINT (10 0)"), // 9: Distance 10 - ], - &WKB_GEOMETRY, - ); - - let indexed_batch = EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), - }; - builder.add_batch(indexed_batch).unwrap(); - - let index = builder.finish().unwrap(); - - // Query point at origin - let query_geom = create_array(&[Some("POINT (0 0)")], &WKB_GEOMETRY); - let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); - let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); - - // 
Test different k values - for k in [1, 3, 5, 7, 10] { - let mut build_positions = Vec::new(); - let result = index - .query_knn(query_wkb, k, false, false, &mut build_positions) - .unwrap(); - - // Verify we got exactly k results (or all available if k > total) - let expected_results = std::cmp::min(k as usize, 10); - assert_eq!(build_positions.len(), expected_results); - assert_eq!(result.count, expected_results); - - // Verify the results are the k closest points - let mut row_indices: Vec = build_positions - .iter() - .map(|(_, row_idx)| *row_idx as usize) - .collect(); - row_indices.sort(); - - let expected_indices: Vec = (0..expected_results).collect(); - assert_eq!(row_indices, expected_indices); - } - } - - #[test] - fn test_knn_query_execution_with_spheroid_distance() { - // Create spatial index - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - - let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - 5, - true, - JoinSide::Left, - )); - - let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 4, - memory_pool, - metrics, - ) - .unwrap(); - - let batch = RecordBatch::new_empty(schema.clone()); - - // Create points with geographic coordinates (longitude, latitude) - let geom_batch = create_array( - &[ - Some("POINT (-74.0 40.7)"), // NYC area - Some("POINT (-73.9 40.7)"), // Slightly east - Some("POINT (-74.1 40.7)"), // Slightly west - Some("POINT (-74.0 40.8)"), // Slightly north - Some("POINT (-74.0 40.6)"), // Slightly south - ], - &WKB_GEOMETRY, - ); - - let indexed_batch = EvaluatedBatch { - batch, - 
geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), - }; - builder.add_batch(indexed_batch).unwrap(); - - let index = builder.finish().unwrap(); - - // Query point at NYC - let query_geom = create_array(&[Some("POINT (-74.0 40.7)")], &WKB_GEOMETRY); - let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); - let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); - - // Test with planar distance (spheroid distance is not supported) - let mut build_positions = Vec::new(); - let result = index - .query_knn( - query_wkb, - 3, // k=3 - false, // use_spheroid=false (only supported option) - false, - &mut build_positions, - ) - .unwrap(); - - // Should find results with planar distance calculation - assert!(!build_positions.is_empty()); // At least the exact match - assert!(result.count >= 1); - assert!(result.candidate_count >= 1); - - // Test that spheroid distance now works with Haversine metric - let mut build_positions_spheroid = Vec::new(); - let result_spheroid = index.query_knn( - query_wkb, - 3, // k=3 - true, // use_spheroid=true (now supported with Haversine) - false, - &mut build_positions_spheroid, - ); - - // Should succeed and return results - assert!(result_spheroid.is_ok()); - let result_spheroid = result_spheroid.unwrap(); - assert!(!build_positions_spheroid.is_empty()); - assert!(result_spheroid.count >= 1); - assert!(result_spheroid.candidate_count >= 1); - } - - #[test] - fn test_knn_query_execution_edge_cases() { - // Create spatial index - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - - let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - 5, - false, - JoinSide::Left, - )); - - let schema = 
Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 4, - memory_pool, - metrics, - ) - .unwrap(); - - let batch = RecordBatch::new_empty(schema.clone()); - - // Create sample data with some edge cases - let geom_batch = create_array( - &[ - Some("POINT (1 1)"), - Some("POINT (2 2)"), - None, // NULL geometry - Some("POINT (3 3)"), - ], - &WKB_GEOMETRY, - ); - - let indexed_batch = EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), - }; - builder.add_batch(indexed_batch).unwrap(); - - let index = builder.finish().unwrap(); - - let query_geom = create_array(&[Some("POINT (0 0)")], &WKB_GEOMETRY); - let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); - let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); - - // Test k=0 (should return no results) - let mut build_positions = Vec::new(); - let result = index - .query_knn( - query_wkb, - 0, // k=0 - false, - false, - &mut build_positions, - ) - .unwrap(); - - assert_eq!(build_positions.len(), 0); - assert_eq!(result.count, 0); - assert_eq!(result.candidate_count, 0); - - // Test k > available geometries - let mut build_positions = Vec::new(); - let result = index - .query_knn( - query_wkb, - 10, // k=10, but only 3 valid geometries available - false, - false, - &mut build_positions, - ) - .unwrap(); - - // Should return all available valid geometries (excluding NULL) - assert_eq!(build_positions.len(), 3); - assert_eq!(result.count, 3); - } - - #[test] - fn test_knn_query_execution_empty_index() { - // Create empty spatial index - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - let 
schema = Arc::new(arrow_schema::Schema::empty()); - - let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - 5, - false, - JoinSide::Left, - )); - - let builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 4, - memory_pool, - metrics, - ) - .unwrap(); - - let index = builder.finish().unwrap(); - - // Try to query empty index - let query_geom = create_array(&[Some("POINT (0 0)")], &WKB_GEOMETRY); - let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); - let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); - - let mut build_positions = Vec::new(); - let result = index - .query_knn(query_wkb, 5, false, false, &mut build_positions) - .unwrap(); - - // Should return no results for empty index - assert_eq!(build_positions.len(), 0); - assert_eq!(result.count, 0); - assert_eq!(result.candidate_count, 0); - } - - #[test] - fn test_knn_query_execution_with_tie_breakers() { - // Create a spatial index with sample geometry data - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - - let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - 5, - false, - JoinSide::Left, - )); - - let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 1, // probe_threads_count - memory_pool.clone(), - metrics, - ) - .unwrap(); - - let batch = RecordBatch::new_empty(schema.clone()); - - // Create points where we have more ties at the k-th distance - // Query point is at 
(0.0, 0.0) - // We'll create a scenario with k=2 where there are 3 points at the same distance - // This ensures the tie-breaker logic has work to do - let geom_batch = create_array( - &[ - Some("POINT (1.0 0.0)"), // Squared distance 1.0 - Some("POINT (0.0 1.0)"), // Squared distance 1.0 (tie!) - Some("POINT (-1.0 0.0)"), // Squared distance 1.0 (tie!) - Some("POINT (0.0 -1.0)"), // Squared distance 1.0 (tie!) - Some("POINT (2.0 0.0)"), // Squared distance 4.0 - Some("POINT (0.0 2.0)"), // Squared distance 4.0 - ], - &WKB_GEOMETRY, - ); - - let indexed_batch = EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), - }; - builder.add_batch(indexed_batch).unwrap(); - - let index = builder.finish().unwrap(); - - // Query point at the origin (0.0, 0.0) - let query_geom = create_array(&[Some("POINT (0.0 0.0)")], &WKB_GEOMETRY); - let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); - let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); - - // Test without tie-breakers: should return exactly k=2 results - let mut build_positions = Vec::new(); - let result = index - .query_knn( - query_wkb, - 2, // k=2 - false, // use_spheroid - false, // include_tie_breakers - &mut build_positions, - ) - .unwrap(); - - // Should return exactly 2 results (the closest point + 1 of the tied points) - assert_eq!(result.count, 2); - assert_eq!(build_positions.len(), 2); - - // Test with tie-breakers: should return k=2 plus all ties - let mut build_positions_with_ties = Vec::new(); - let result_with_ties = index - .query_knn( - query_wkb, - 2, // k=2 - false, // use_spheroid - true, // include_tie_breakers - &mut build_positions_with_ties, - ) - .unwrap(); - - // Should return more than 2 results because of ties - // We have 4 points at squared distance 1.0 (all tied for closest) - // With k=2 and tie-breakers: - // - Initial neighbors query returns 2 of the 4 tied points - // - Tie-breaker logic 
should find the other 2 tied points - // - Total should be 4 results (all points at distance 1.0) - - // With 4 points all at the same distance and k=2: - // - Without tie-breakers: should return exactly 2 - // - With tie-breakers: should return all 4 tied points - assert_eq!( - result.count, 2, - "Without tie-breakers should return exactly k=2" - ); - assert_eq!( - result_with_ties.count, 4, - "With tie-breakers should return all 4 tied points" - ); - assert_eq!(build_positions_with_ties.len(), 4); - } - - #[test] - fn test_query_knn_with_geometry_distance() { - // Create a spatial index with sample geometry data - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - - let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - 5, - false, - JoinSide::Left, - )); - - // Create sample geometry data - points at known locations - let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 4, - memory_pool, - metrics, - ) - .unwrap(); - - let batch = RecordBatch::new_empty(schema.clone()); - - // Create geometries at different distances from the query point (0, 0) - let geom_batch = create_array( - &[ - Some("POINT (1 0)"), // Distance: 1.0 - Some("POINT (0 2)"), // Distance: 2.0 - Some("POINT (3 0)"), // Distance: 3.0 - Some("POINT (0 4)"), // Distance: 4.0 - Some("POINT (5 0)"), // Distance: 5.0 - Some("POINT (2 2)"), // Distance: ~2.83 - Some("POINT (1 1)"), // Distance: ~1.41 - ], - &WKB_GEOMETRY, - ); - - let indexed_batch = EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), 
- }; - builder.add_batch(indexed_batch).unwrap(); - - let index = builder.finish().unwrap(); - - // Create a query geometry at origin (0, 0) - let query_geom = create_array(&[Some("POINT (0 0)")], &WKB_GEOMETRY); - let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); - let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); - - // Test the geometry-based query_knn method with k=3 - let mut build_positions = Vec::new(); - let result = index - .query_knn( - query_wkb, - 3, // k=3 - false, // use_spheroid=false - false, // include_tie_breakers=false - &mut build_positions, - ) - .unwrap(); - - // Verify we got results (should be 3 or less) - assert!(!build_positions.is_empty()); - assert!(build_positions.len() <= 3); - assert!(result.count > 0); - assert!(result.count <= 3); - } - - #[test] - fn test_query_knn_with_mixed_geometries() { - // Create a spatial index with complex geometries where geometry-based - // distance should differ from centroid-based distance - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - - let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - 5, - false, - JoinSide::Left, - )); - - // Create different geometry types - let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 4, - memory_pool, - metrics, - ) - .unwrap(); - - let batch = RecordBatch::new_empty(schema.clone()); - - // Mix of points and linestrings - let geom_batch = create_array( - &[ - Some("POINT (1 1)"), // Simple point - Some("LINESTRING (2 0, 2 4)"), // Vertical line - closest point should be 
(2, 1) - Some("LINESTRING (10 10, 10 20)"), // Far away line - Some("POINT (5 5)"), // Far point - ], - &WKB_GEOMETRY, - ); - - let indexed_batch = EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), - }; - builder.add_batch(indexed_batch).unwrap(); - - let index = builder.finish().unwrap(); - - // Query point close to the linestring - let query_geom = create_array(&[Some("POINT (2.1 1.0)")], &WKB_GEOMETRY); - let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); - let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); - - // Test the geometry-based KNN method with mixed geometry types - let mut build_positions = Vec::new(); - - let result = index - .query_knn( - query_wkb, - 2, // k=2 - false, // use_spheroid=false - false, // include_tie_breakers=false - &mut build_positions, - ) - .unwrap(); - - // Should return results - assert!(!build_positions.is_empty()); - - // Should work with mixed geometry types - assert!(result.count > 0); - } - - #[test] - fn test_query_knn_with_tie_breakers_geometry_distance() { - // Create a spatial index with geometries that have identical distances for tie-breaker testing - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - - let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - 5, - false, - JoinSide::Left, - )); - - let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 4, - memory_pool, - metrics, - ) - .unwrap(); - - let batch = RecordBatch::new_empty(schema.clone()); - - // Create points 
where we have multiple points at the same distance from the query point - // Query point will be at (0, 0), and we'll have 4 points all at distance sqrt(2) ≈ 1.414 - let geom_batch = create_array( - &[ - Some("POINT (1.0 1.0)"), // Distance: sqrt(2) - Some("POINT (1.0 -1.0)"), // Distance: sqrt(2) - tied with above - Some("POINT (-1.0 1.0)"), // Distance: sqrt(2) - tied with above - Some("POINT (-1.0 -1.0)"), // Distance: sqrt(2) - tied with above - Some("POINT (2.0 0.0)"), // Distance: 2.0 - farther away - Some("POINT (0.0 2.0)"), // Distance: 2.0 - farther away - ], - &WKB_GEOMETRY, - ); - - let indexed_batch = EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), - }; - builder.add_batch(indexed_batch).unwrap(); - - let index = builder.finish().unwrap(); - - // Query point at the origin (0.0, 0.0) - let query_geom = create_array(&[Some("POINT (0.0 0.0)")], &WKB_GEOMETRY); - let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); - let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); - - // Test without tie-breakers: should return exactly k=2 results - let mut build_positions = Vec::new(); - let result = index - .query_knn( - query_wkb, - 2, // k=2 - false, // use_spheroid - false, // include_tie_breakers=false - &mut build_positions, - ) - .unwrap(); - - // Should return exactly 2 results - assert_eq!(result.count, 2); - assert_eq!(build_positions.len(), 2); - - // Test with tie-breakers: should return all tied points - let mut build_positions_with_ties = Vec::new(); - let result_with_ties = index - .query_knn( - query_wkb, - 2, // k=2 - false, // use_spheroid - true, // include_tie_breakers=true - &mut build_positions_with_ties, - ) - .unwrap(); - - // Should return 4 results because of ties (all 4 points at distance sqrt(2)) - assert!(result_with_ties.count == 4); - - // Query using a box centered at the origin - let query_geom = create_array( - &[Some( - "POLYGON ((-0.5 
-0.5, -0.5 0.5, 0.5 0.5, 0.5 -0.5, -0.5 -0.5))", - )], - &WKB_GEOMETRY, - ); - let query_array = EvaluatedGeometryArray::try_new(query_geom, &WKB_GEOMETRY).unwrap(); - let query_wkb = &query_array.wkbs()[0].as_ref().unwrap(); - - // This query should return 4 points - let mut build_positions_with_ties = Vec::new(); - let result_with_ties = index - .query_knn( - query_wkb, - 2, // k=2 - false, // use_spheroid - true, // include_tie_breakers=true - &mut build_positions_with_ties, - ) - .unwrap(); - - // Should return 4 results because of ties (all 4 points at distance sqrt(2)) - assert!(result_with_ties.count == 4); - } - - #[test] - fn test_knn_query_with_empty_geometry() { - // Create a spatial index with sample geometry data like other tests - let memory_pool = Arc::new(GreedyMemoryPool::new(1024 * 1024)); - let options = SpatialJoinOptions { - execution_mode: ExecutionMode::PrepareBuild, - ..Default::default() - }; - let metrics = SpatialJoinBuildMetrics::default(); - - let spatial_predicate = SpatialPredicate::KNearestNeighbors(KNNPredicate::new( - Arc::new(Column::new("geom", 0)), - Arc::new(Column::new("geom", 1)), - 5, - false, - JoinSide::Left, - )); - - // Create geometry batch using the same pattern as other tests - let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema.clone(), - spatial_predicate, - options, - JoinType::Inner, - 1, // probe_threads_count - memory_pool.clone(), - metrics, - ) - .unwrap(); - - let batch = RecordBatch::new_empty(schema.clone()); - - let geom_batch = create_array( - &[ - Some("POINT (0 0)"), - Some("POINT (1 1)"), - Some("POINT (2 2)"), - ], - &WKB_GEOMETRY, - ); - let indexed_batch = EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_batch, &WKB_GEOMETRY).unwrap(), - }; - builder.add_batch(indexed_batch).unwrap(); - - let index = builder.finish().unwrap(); - - // Create an empty point 
WKB - let mut empty_point_wkb = Vec::new(); - write_wkb_empty_point(&mut empty_point_wkb, Dimensions::Xy).unwrap(); - - // Query with the empty point - let mut build_positions = Vec::new(); - let result = index - .query_knn( - &wkb::reader::read_wkb(&empty_point_wkb).unwrap(), - 2, // k=2 - false, // use_spheroid - false, // include_tie_breakers - &mut build_positions, - ) - .unwrap(); - - // Should return empty results for empty geometry - assert_eq!(result.count, 0); - assert_eq!(result.candidate_count, 0); - assert!(build_positions.is_empty()); - } - - async fn setup_index_for_batch_test( - build_geoms: &[Option<&str>], - options: SpatialJoinOptions, - ) -> Arc { - let memory_pool = Arc::new(GreedyMemoryPool::new(100 * 1024 * 1024)); - let metrics = SpatialJoinBuildMetrics::default(); - let spatial_predicate = SpatialPredicate::Relation(RelationPredicate::new( - Arc::new(Column::new("left", 0)), - Arc::new(Column::new("right", 0)), - SpatialRelationType::Intersects, - )); - let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])); - - let mut builder = SpatialIndexBuilder::new( - schema, - spatial_predicate, - options, - JoinType::Inner, - 1, - memory_pool, - metrics, - ) - .unwrap(); - - let geom_array = create_array(build_geoms, &WKB_GEOMETRY); - let batch = RecordBatch::try_new( - Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - )])), - vec![Arc::new(geom_array.clone())], - ) - .unwrap(); - let evaluated_batch = EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_array, &WKB_GEOMETRY).unwrap(), - }; - - builder.add_batch(evaluated_batch).unwrap(); - Arc::new(builder.finish().unwrap()) - } - - fn create_probe_batch(probe_geoms: &[Option<&str>]) -> Arc { - let geom_array = create_array(probe_geoms, &WKB_GEOMETRY); - let batch = RecordBatch::try_new( - Arc::new(arrow_schema::Schema::new(vec![Field::new( - "geom", - DataType::Binary, - true, - 
)])), - vec![Arc::new(geom_array.clone())], - ) - .unwrap(); - Arc::new(EvaluatedBatch { - batch, - geom_array: EvaluatedGeometryArray::try_new(geom_array, &WKB_GEOMETRY).unwrap(), - }) - } - - #[tokio::test] - async fn test_query_batch_empty_results() { - let build_geoms = &[Some("POINT (0 0)"), Some("POINT (1 1)")]; - let index = setup_index_for_batch_test(build_geoms, SpatialJoinOptions::default()).await; - - // Probe with geometries that don't intersect - let probe_geoms = &[Some("POINT (10 10)"), Some("POINT (20 20)")]; - let probe_batch = create_probe_batch(probe_geoms); - - let mut build_batch_positions = Vec::new(); - let mut probe_indices = Vec::new(); - let (metrics, next_idx) = index - .query_batch( - &probe_batch, - 0..2, - usize::MAX, - &mut build_batch_positions, - &mut probe_indices, - ) - .await - .unwrap(); - - assert_eq!(metrics.count, 0); - assert_eq!(build_batch_positions.len(), 0); - assert_eq!(probe_indices.len(), 0); - assert_eq!(next_idx, 2); - } - - #[tokio::test] - async fn test_query_batch_max_result_size() { - let build_geoms = &[ - Some("POINT (0 0)"), - Some("POINT (0 0)"), - Some("POINT (0 0)"), - ]; - let index = setup_index_for_batch_test(build_geoms, SpatialJoinOptions::default()).await; - - // Probe with geometry that intersects all 3 - let probe_geoms = &[Some("POINT (0 0)"), Some("POINT (0 0)")]; - let probe_batch = create_probe_batch(probe_geoms); - - // Case 1: Max result size is large enough - let mut build_batch_positions = Vec::new(); - let mut probe_indices = Vec::new(); - let (metrics, next_idx) = index - .query_batch( - &probe_batch, - 0..2, - 10, - &mut build_batch_positions, - &mut probe_indices, - ) - .await - .unwrap(); - assert_eq!(metrics.count, 6); // 2 probes * 3 matches - assert_eq!(next_idx, 2); - assert_eq!(probe_indices, vec![0, 0, 0, 1, 1, 1]); - - // Case 2: Max result size is small (stops after first probe) - let mut build_batch_positions = Vec::new(); - let mut probe_indices = Vec::new(); - let (metrics, 
next_idx) = index - .query_batch( - &probe_batch, - 0..2, - 2, // Stop after 2 results - &mut build_batch_positions, - &mut probe_indices, - ) - .await - .unwrap(); - - // It should process the first probe, find 3 matches. - // Since 3 >= 2, it should stop. - assert_eq!(metrics.count, 3); - assert_eq!(next_idx, 1); // Only processed 1 probe - assert_eq!(probe_indices, vec![0, 0, 0]); - } - - #[tokio::test] - async fn test_query_batch_parallel_refinement() { - // Create enough build geometries to trigger parallel refinement - // We need candidates.len() >= chunk_size * 2 - // Let's set chunk_size = 2, so we need >= 4 candidates. - let build_geoms = vec![Some("POINT (0 0)"); 10]; - let options = SpatialJoinOptions { - parallel_refinement_chunk_size: 2, - ..Default::default() - }; - - let index = setup_index_for_batch_test(&build_geoms, options).await; - - // Probe with a geometry that intersects all build geometries - let probe_geoms = &[Some("POLYGON ((-1 -1, 1 -1, 1 1, -1 1, -1 -1))")]; - let probe_batch = create_probe_batch(probe_geoms); - - let mut build_batch_positions = Vec::new(); - let mut probe_indices = Vec::new(); - let (metrics, next_idx) = index - .query_batch( - &probe_batch, - 0..1, - usize::MAX, - &mut build_batch_positions, - &mut probe_indices, - ) - .await - .unwrap(); - - assert_eq!(metrics.count, 10); - assert_eq!(build_batch_positions.len(), 10); - assert_eq!(probe_indices, vec![0; 10]); - assert_eq!(next_idx, 1); - } - - #[tokio::test] - async fn test_query_batch_empty_range() { - let build_geoms = &[Some("POINT (0 0)")]; - let index = setup_index_for_batch_test(build_geoms, SpatialJoinOptions::default()).await; - let probe_geoms = &[Some("POINT (0 0)"), Some("POINT (0 0)")]; - let probe_batch = create_probe_batch(probe_geoms); - - let mut build_batch_positions = Vec::new(); - let mut probe_indices = Vec::new(); - - // Query with empty range - for empty_ranges in [0..0, 1..1, 2..2] { - let (metrics, next_idx) = index - .query_batch( - 
&probe_batch, - empty_ranges.clone(), - usize::MAX, - &mut build_batch_positions, - &mut probe_indices, - ) - .await - .unwrap(); - - assert_eq!(metrics.count, 0); - assert_eq!(next_idx, empty_ranges.end); - } - } - - #[tokio::test] - async fn test_query_batch_range_offset() { - let build_geoms = &[Some("POINT (0 0)"), Some("POINT (1 1)")]; - let index = setup_index_for_batch_test(build_geoms, SpatialJoinOptions::default()).await; - - // Probe with 3 geometries: - // 0: POINT (0 0) - matches build[0] (should be skipped) - // 1: POINT (0 0) - matches build[0] - // 2: POINT (1 1) - matches build[1] - let probe_geoms = &[ - Some("POINT (0 0)"), - Some("POINT (0 0)"), - Some("POINT (1 1)"), - ]; - let probe_batch = create_probe_batch(probe_geoms); - - let mut build_batch_positions = Vec::new(); - let mut probe_indices = Vec::new(); - - // Query with range 1..3 (skipping the first probe) - let (metrics, next_idx) = index - .query_batch( - &probe_batch, - 1..3, - usize::MAX, - &mut build_batch_positions, - &mut probe_indices, - ) - .await - .unwrap(); - - assert_eq!(metrics.count, 2); - assert_eq!(next_idx, 3); - - // probe_indices should contain indices relative to the batch start (1 and 2) - assert_eq!(probe_indices, vec![1, 2]); - - // build_batch_positions should contain matches for probe 1 and probe 2 - // probe 1 matches build 0 (0, 0) - // probe 2 matches build 1 (0, 1) - // Note: build_batch_positions contains (batch_idx, row_idx) - // Since we have 1 batch, batch_idx is 0. 
- assert_eq!(build_batch_positions, vec![(0, 0), (0, 1)]); - } - - #[tokio::test] - async fn test_query_batch_zero_parallel_refinement_chunk_size() { - let build_geoms = &[ - Some("POINT (0 0)"), - Some("POINT (0 0)"), - Some("POINT (0 0)"), - ]; - let options = SpatialJoinOptions { - // force synchronous refinement - parallel_refinement_chunk_size: 0, - ..Default::default() - }; - - let index = setup_index_for_batch_test(build_geoms, options).await; - let probe_geoms = &[Some("POINT (0 0)")]; - let probe_batch = create_probe_batch(probe_geoms); - - let mut build_batch_positions = Vec::new(); - let mut probe_indices = Vec::new(); - - let result = index - .query_batch( - &probe_batch, - 0..1, - 10, - &mut build_batch_positions, - &mut probe_indices, - ) - .await; - - assert!(result.is_ok()); - let (metrics, _) = result.unwrap(); - assert_eq!(metrics.count, 3); - } -} +pub type SpatialIndexRef = Arc; diff --git a/rust/sedona-spatial-join/src/lib.rs b/rust/sedona-spatial-join/src/lib.rs index 94af3f225..6731efcb0 100644 --- a/rust/sedona-spatial-join/src/lib.rs +++ b/rust/sedona-spatial-join/src/lib.rs @@ -31,8 +31,7 @@ pub use exec::SpatialJoinExec; pub use optimizer::register_spatial_join_optimizer; // Re-export types needed for external usage (e.g., in Comet) -pub use build_index::build_index; -pub use index::{SpatialIndex, SpatialJoinBuildMetrics}; +pub use index::SpatialJoinBuildMetrics; pub use spatial_predicate::SpatialPredicate; // Re-export option types from sedona-common for convenience diff --git a/rust/sedona-spatial-join/src/operand_evaluator.rs b/rust/sedona-spatial-join/src/operand_evaluator.rs index 8b4313962..cb55dbc87 100644 --- a/rust/sedona-spatial-join/src/operand_evaluator.rs +++ b/rust/sedona-spatial-join/src/operand_evaluator.rs @@ -25,7 +25,6 @@ use datafusion_common::{ use datafusion_expr::ColumnarValue; use datafusion_physical_expr::PhysicalExpr; use float_next_after::NextAfter; -use geo_index::rtree::util::f64_box_to_f32; use 
geo_types::{coord, Rect}; use sedona_functions::executor::IterGeo; use sedona_geo_generic_alg::BoundingRect; @@ -102,10 +101,38 @@ pub struct EvaluatedGeometryArray { /// but we'll only allow accessing Wkb<'a> where 'a is the lifetime of the GeometryBatchResult to make /// the interfaces safe. The buffers in `geometry_array` are allocated on the heap and won't be moved when /// the GeometryBatchResult is moved, so we don't need to worry about pinning. - wkbs: Vec>>, + pub wkbs: Vec>>, } impl EvaluatedGeometryArray { + #[cfg(feature = "gpu")] + /// Expand the box by two ULPs to ensure the resulting f32 box covers a f64 point that + /// is covered by the original f64 box. + fn make_conservative_box( + min_x: f64, + min_y: f64, + max_x: f64, + max_y: f64, + ) -> (f32, f32, f32, f32) { + let mut new_min_x = min_x as f32; + let mut new_min_y = min_y as f32; + let mut new_max_x = max_x as f32; + let mut new_max_y = max_y as f32; + + for _ in 0..2 { + new_min_x = new_min_x.next_after(f32::NEG_INFINITY); + new_min_y = new_min_y.next_after(f32::NEG_INFINITY); + new_max_x = new_max_x.next_after(f32::INFINITY); + new_max_y = new_max_y.next_after(f32::INFINITY); + } + + debug_assert!((new_min_x as f64) <= min_x); + debug_assert!((new_min_y as f64) <= min_y); + debug_assert!((new_max_x as f64) >= max_x); + debug_assert!((new_max_y as f64) >= max_y); + + (new_min_x, new_min_y, new_max_x, new_max_y) + } pub fn try_new(geometry_array: ArrayRef, sedona_type: &SedonaType) -> Result { let num_rows = geometry_array.len(); let mut rect_vec = Vec::with_capacity(num_rows); @@ -115,10 +142,35 @@ impl EvaluatedGeometryArray { if let Some(rect) = wkb.bounding_rect() { let min = rect.min(); let max = rect.max(); - // f64_box_to_f32 will ensure the resulting `f32` box is no smaller than the `f64` box. 
- let (min_x, min_y, max_x, max_y) = f64_box_to_f32(min.x, min.y, max.x, max.y); - let rect = Rect::new(coord!(x: min_x, y: min_y), coord!(x: max_x, y: max_y)); - Some(rect) + #[cfg(feature = "gpu")] + { + use wkb::reader::GeometryType; + // For point geometries, we can directly cast f64 to f32 without expanding the box. + // This enables libgpuspatial to treat the Rect as point for faster processing. + if wkb.geometry_type() == GeometryType::Point { + Some(Rect::new( + coord!(x: min.x as f32, y: min.y as f32), + coord!(x: max.x as f32, y: max.y as f32), + )) + } else { + let (min_x, min_y, max_x, max_y) = + Self::make_conservative_box(min.x, min.y, max.x, max.y); + Some(Rect::new( + coord!(x: min_x, y: min_y), + coord!(x: max_x, y: max_y), + )) + } + } + #[cfg(not(feature = "gpu"))] + { + use geo_index::rtree::util::f64_box_to_f32; + // f64_box_to_f32 will ensure the resulting `f32` box is no smaller than the `f64` box. + let (min_x, min_y, max_x, max_y) = + f64_box_to_f32(min.x, min.y, max.x, max.y); + let rect = + Rect::new(coord!(x: min_x, y: min_y), coord!(x: max_x, y: max_y)); + Some(rect) + } } else { None } diff --git a/rust/sedona-spatial-join/src/optimizer.rs b/rust/sedona-spatial-join/src/optimizer.rs index bd01821b1..5c9143338 100644 --- a/rust/sedona-spatial-join/src/optimizer.rs +++ b/rust/sedona-spatial-join/src/optimizer.rs @@ -27,11 +27,11 @@ use datafusion::{ config::ConfigOptions, execution::session_state::SessionStateBuilder, physical_optimizer::PhysicalOptimizerRule, }; -use datafusion_common::ScalarValue; use datafusion_common::{ tree_node::{Transformed, TreeNode}, JoinSide, }; +use datafusion_common::{DataFusionError, ScalarValue}; use datafusion_common::{HashMap, Result}; use datafusion_expr::{Expr, Filter, Join, JoinType, LogicalPlan, Operator}; use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; @@ -41,7 +41,7 @@ use datafusion_physical_plan::joins::utils::ColumnIndex; use 
datafusion_physical_plan::joins::{HashJoinExec, NestedLoopJoinExec}; use datafusion_physical_plan::projection::ProjectionExec; use datafusion_physical_plan::{joins::utils::JoinFilter, ExecutionPlan}; -use sedona_common::{option::SedonaOptions, sedona_internal_err}; +use sedona_common::{option::SedonaOptions, sedona_internal_err, SpatialJoinOptions}; use sedona_expr::utils::{parse_distance_predicate, ParsedDistancePredicate}; use sedona_schema::datatypes::SedonaType; use sedona_schema::matchers::ArgMatcher; @@ -235,19 +235,31 @@ impl SpatialJoinOptimizer { fn try_optimize_join( &self, plan: Arc, - _config: &ConfigOptions, + config: &ConfigOptions, ) -> Result>> { + let sedona_options = config + .extensions + .get::() + .ok_or_else(|| DataFusionError::Internal("SedonaOptions not found".into()))?; // Check if this is a NestedLoopJoinExec that we can convert to spatial join if let Some(nested_loop_join) = plan.as_any().downcast_ref::() { - if let Some(spatial_join) = self.try_convert_to_spatial_join(nested_loop_join)? { + if let Some(spatial_join) = + self.try_convert_to_spatial_join(nested_loop_join, &sedona_options.spatial_join)? + { return Ok(Transformed::yes(spatial_join)); } } // Check if this is a HashJoinExec with spatial filter that we can convert to spatial join if let Some(hash_join) = plan.as_any().downcast_ref::() { - if let Some(spatial_join) = self.try_convert_hash_join_to_spatial(hash_join)? { - return Ok(Transformed::yes(spatial_join)); + if let Some(spatial_join) = + self.try_convert_hash_join_to_spatial(hash_join, &sedona_options.spatial_join)? 
+ { + if let Some(_spatial_join_exec) = + spatial_join.as_any().downcast_ref::() + { + return Ok(Transformed::yes(spatial_join)); + } } } @@ -261,6 +273,7 @@ impl SpatialJoinOptimizer { fn try_convert_to_spatial_join( &self, nested_loop_join: &NestedLoopJoinExec, + options: &SpatialJoinOptions, ) -> Result>> { if let Some(join_filter) = nested_loop_join.filter() { if let Some((spatial_predicate, remainder)) = transform_join_filter(join_filter) { @@ -292,6 +305,9 @@ impl SpatialJoinOptimizer { return Ok(None); } + // Check if we can use GPU for this spatial join + let use_gpu = is_using_gpu(&spatial_predicate, options)?; + // Create the spatial join let spatial_join = SpatialJoinExec::try_new( left, @@ -300,6 +316,7 @@ impl SpatialJoinOptimizer { remainder, join_type, nested_loop_join.projection().cloned(), + use_gpu, )?; return Ok(Some(Arc::new(spatial_join))); @@ -316,6 +333,7 @@ impl SpatialJoinOptimizer { fn try_convert_hash_join_to_spatial( &self, hash_join: &HashJoinExec, + options: &SpatialJoinOptions, ) -> Result>> { // Check if the filter contains spatial predicates if let Some(join_filter) = hash_join.filter() { @@ -341,6 +359,9 @@ impl SpatialJoinOptimizer { // Combine the equi-filter with any existing remainder remainder = self.combine_filters(remainder, equi_filter)?; + // Check if we can use GPU for this spatial join + let use_gpu = is_using_gpu(&spatial_predicate, options)?; + // Create spatial join where: // - Spatial predicate (ST_KNN) drives the join // - Equi-conditions (c.id = r.id) become filters @@ -355,6 +376,7 @@ impl SpatialJoinOptimizer { hash_join.join_type(), None, // No projection in SpatialJoinExec true, // converted_from_hash_join = true + use_gpu, )?); // Now wrap it with ProjectionExec to match HashJoinExec's output schema exactly @@ -1054,18 +1076,59 @@ fn is_spatial_predicate_supported( } } +fn is_using_gpu( + spatial_predicate: &SpatialPredicate, + join_opts: &SpatialJoinOptions, +) -> Result { + if join_opts.gpu.enable { + if 
is_spatial_predicate_supported_on_gpu(spatial_predicate) { + return Ok(true); + } else if join_opts.gpu.fallback_to_cpu { + log::warn!( + "Falling back to CPU spatial join as the spatial predicate is not supported on GPU" + ); + return Ok(false); + } else { + return sedona_internal_err!( + "GPU spatial join is enabled, but the spatial predicate is not supported on GPU" + ); + } + } + Ok(false) +} + +fn is_spatial_predicate_supported_on_gpu(spatial_predicate: &SpatialPredicate) -> bool { + match spatial_predicate { + SpatialPredicate::Relation(rel) => match rel.relation_type { + SpatialRelationType::Intersects => true, + SpatialRelationType::Contains => true, + SpatialRelationType::Within => true, + SpatialRelationType::Covers => true, + SpatialRelationType::CoveredBy => true, + SpatialRelationType::Touches => true, + SpatialRelationType::Crosses => false, + SpatialRelationType::Overlaps => false, + SpatialRelationType::Equals => true, + }, + SpatialPredicate::Distance(_) => false, + SpatialPredicate::KNearestNeighbors(_) => false, + } +} + #[cfg(test)] mod tests { use super::*; - use crate::spatial_predicate::{SpatialPredicate, SpatialRelationType}; + use crate::spatial_predicate::SpatialPredicate; use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::{JoinSide, ScalarValue}; + use datafusion_execution::config::SessionConfig; use datafusion_expr::Operator; use datafusion_expr::{col, lit, ColumnarValue, Expr, ScalarUDF, SimpleScalarUDF}; use datafusion_physical_expr::expressions::{BinaryExpr, Column, IsNotNullExpr, Literal}; use datafusion_physical_expr::{PhysicalExpr, ScalarFunctionExpr}; use datafusion_physical_plan::joins::utils::ColumnIndex; use datafusion_physical_plan::joins::utils::JoinFilter; + use sedona_common::add_sedona_option_extension; use sedona_schema::datatypes::{WKB_GEOGRAPHY, WKB_GEOMETRY}; use std::sync::Arc; @@ -2779,4 +2842,19 @@ mod tests { }); assert!(!super::is_spatial_predicate(&non_spatial_and)); } + + #[test] + fn 
test_gpu_disabled_by_default() { + // Create default config + let config = SessionConfig::new(); + let config = add_sedona_option_extension(config); + let options = config.options(); + + // GPU should be disabled by default + let sedona_options = options + .extensions + .get::() + .unwrap(); + assert!(!sedona_options.spatial_join.gpu.enable); + } } diff --git a/rust/sedona-spatial-join/src/stream.rs b/rust/sedona-spatial-join/src/stream.rs index 6cf175c28..96d728c2a 100644 --- a/rust/sedona-spatial-join/src/stream.rs +++ b/rust/sedona-spatial-join/src/stream.rs @@ -38,7 +38,7 @@ use std::sync::Arc; use crate::evaluated_batch::evaluated_batch_stream::evaluate::create_evaluated_probe_stream; use crate::evaluated_batch::evaluated_batch_stream::SendableEvaluatedBatchStream; use crate::evaluated_batch::EvaluatedBatch; -use crate::index::SpatialIndex; +use crate::index::spatial_index::SpatialIndexRef; use crate::operand_evaluator::create_operand_evaluator; use crate::spatial_predicate::SpatialPredicate; use crate::utils::join_utils::{ @@ -74,12 +74,12 @@ pub(crate) struct SpatialJoinStream { /// Target output batch size target_output_batch_size: usize, /// Once future for the spatial index - once_fut_spatial_index: OnceFut, + once_fut_spatial_index: OnceFut, /// Once async for the spatial index, will be manually disposed by the last finished stream /// to avoid unnecessary memory usage. 
- once_async_spatial_index: Arc>>>, + once_async_spatial_index: Arc>>>, /// The spatial index - spatial_index: Option>, + spatial_index: Option, /// The spatial predicate being evaluated spatial_predicate: SpatialPredicate, } @@ -97,8 +97,8 @@ impl SpatialJoinStream { join_metrics: SpatialJoinProbeMetrics, options: SpatialJoinOptions, target_output_batch_size: usize, - once_fut_spatial_index: OnceFut, - once_async_spatial_index: Arc>>>, + once_fut_spatial_index: OnceFut, + once_async_spatial_index: Arc>>>, ) -> Self { let evaluator = create_operand_evaluator(on, options.clone()); let probe_stream = create_evaluated_probe_stream( @@ -217,8 +217,8 @@ impl SpatialJoinStream { &mut self, cx: &mut std::task::Context<'_>, ) -> Poll>>> { - let index = ready!(self.once_fut_spatial_index.get_shared(cx))?; - self.spatial_index = Some(index); + let index = ready!(self.once_fut_spatial_index.get(cx))?; + self.spatial_index = Some(index.clone()); self.state = SpatialJoinStreamState::FetchProbeBatch; Poll::Ready(Ok(StatefulStreamResult::Continue)) } @@ -483,7 +483,7 @@ pub(crate) struct SpatialJoinBatchIterator { /// The side of the build stream, either Left or Right build_side: JoinSide, /// The spatial index reference - spatial_index: Arc, + spatial_index: SpatialIndexRef, /// The probe side batch being processed probe_evaluated_batch: Arc, /// Join metrics for tracking performance @@ -610,7 +610,7 @@ pub(crate) struct SpatialJoinBatchIteratorParams { pub join_type: JoinType, pub column_indices: Vec, pub build_side: JoinSide, - pub spatial_index: Arc, + pub spatial_index: SpatialIndexRef, pub probe_evaluated_batch: Arc, pub join_metrics: SpatialJoinProbeMetrics, pub max_batch_size: usize, @@ -1056,7 +1056,7 @@ impl std::fmt::Debug for SpatialJoinBatchIterator { /// Iterator that processes unmatched build-side batches for outer joins pub(crate) struct UnmatchedBuildBatchIterator { /// The spatial index reference - spatial_index: Arc, + spatial_index: SpatialIndexRef, /// 
Current batch index being processed current_batch_idx: usize, /// Total number of batches to process @@ -1069,16 +1069,16 @@ pub(crate) struct UnmatchedBuildBatchIterator { impl UnmatchedBuildBatchIterator { pub(crate) fn new( - spatial_index: Arc, + spatial_index: SpatialIndexRef, empty_right_batch: RecordBatch, ) -> Result { - let visited_left_side = spatial_index.visited_build_side(); - let Some(vec_visited_left_side) = visited_left_side else { + let visited_build_side = spatial_index.visited_build_side(); + let Some(vec_visited_build_side) = visited_build_side else { return sedona_internal_err!("The bitmap for visited left side is not created"); }; let total_batches = { - let visited_bitmaps = vec_visited_left_side.lock(); + let visited_bitmaps = vec_visited_build_side.lock(); visited_bitmaps.len() }; @@ -1099,16 +1099,16 @@ impl UnmatchedBuildBatchIterator { build_side: JoinSide, ) -> Result> { while self.current_batch_idx < self.total_batches && !self.is_complete { - let visited_left_side = self.spatial_index.visited_build_side(); - let Some(vec_visited_left_side) = visited_left_side else { + let visited_build_side = self.spatial_index.visited_build_side(); + let Some(vec_visited_build_side) = visited_build_side else { return sedona_internal_err!("The bitmap for visited left side is not created"); }; let batch = { - let visited_bitmaps = vec_visited_left_side.lock(); - let visited_left_side = &visited_bitmaps[self.current_batch_idx]; + let visited_bitmaps = vec_visited_build_side.lock(); + let visited_build_side = &visited_bitmaps[self.current_batch_idx]; let (left_side, right_side) = - get_final_indices_from_bit_map(visited_left_side, join_type); + get_final_indices_from_bit_map(visited_build_side, join_type); build_batch_from_indices( schema, diff --git a/rust/sedona-spatial-join/src/utils/once_fut.rs b/rust/sedona-spatial-join/src/utils/once_fut.rs index 8e7f4d497..946520140 100644 --- a/rust/sedona-spatial-join/src/utils/once_fut.rs +++ 
b/rust/sedona-spatial-join/src/utils/once_fut.rs @@ -150,6 +150,7 @@ impl OnceFut { } /// Get shared reference to the result of the computation if it is ready, without consuming it + #[allow(unused)] pub(crate) fn get_shared(&mut self, cx: &mut Context<'_>) -> Poll>> { if let OnceFutState::Pending(fut) = &mut self.state { let r = ready!(fut.poll_unpin(cx)); diff --git a/rust/sedona/Cargo.toml b/rust/sedona/Cargo.toml index 1172f77ad..106d22574 100644 --- a/rust/sedona/Cargo.toml +++ b/rust/sedona/Cargo.toml @@ -42,6 +42,7 @@ http = ["object_store/http"] proj = ["sedona-proj/proj-sys"] spatial-join = ["dep:sedona-spatial-join"] s2geography = ["dep:sedona-s2geography"] +gpu = ["sedona-spatial-join/gpu"] [dev-dependencies] tempfile = { workspace = true } diff --git a/rust/sedona/src/context.rs b/rust/sedona/src/context.rs index 6efd3ba4f..2fbcb1bec 100644 --- a/rust/sedona/src/context.rs +++ b/rust/sedona/src/context.rs @@ -84,6 +84,23 @@ impl SedonaContext { // variables. let session_config = SessionConfig::from_env()?.with_information_schema(true); let session_config = add_sedona_option_extension(session_config); + + // Auto-enable GPU when built with gpu feature + // The optimizer will check actual GPU availability at runtime + #[cfg(feature = "gpu")] + let session_config = { + use sedona_common::option::SedonaOptions; + let mut session_config = session_config; + if let Some(sedona_opts) = session_config + .options_mut() + .extensions + .get_mut::() + { + sedona_opts.spatial_join.gpu.enable = true; + } + session_config + }; + let rt_builder = RuntimeEnvBuilder::new(); let runtime_env = rt_builder.build_arc()?;