From 368e6c587e5630c273aaa339922db202398cdc2c Mon Sep 17 00:00:00 2001 From: arhik Date: Sun, 18 Jan 2026 04:46:10 +0000 Subject: [PATCH 1/6] Add scan (prefix sum) operations support This commit adds support for scan (parallel prefix sum) operations to cuTile, based on the IntegerReduce branch and commit 0c9ab90. Key changes: - Added encode_ScanOp! to bytecode encodings for generating ScanOp bytecode - Added encode_scan_identity_array! to reuse existing identity encoding - Added scan intrinsic implementation using operation_identity from IntegerReduce - Added scan() and cumsum() public APIs with proper 1-indexed to 0-indexed axis conversion - Added comprehensive codegen tests for scan operations - Added scankernel.jl example demonstrating CSDL scan algorithm Features: - Supports cumulative sum (cumsum) for float and integer types - Supports both forward and reverse scan directions - Reuses FloatIdentityOp and IntegerIdentityOp from IntegerReduce - Uses operation_identity function for cleaner identity value creation - 1-indexed axis parameter (consistent with reduce operations) - Preserves tile shape (scan is an element-wise operation along one dimension) Tests: - All 142 codegen tests pass (including 6 new scan tests) - Scankernel.jl example runs successfully with CSDL algorithm Minor comment improvements in scankernel.jl example: - Clarify that it demonstrates device-side scan operation - Add note that test might occasionally fail (race condition in phase 2 loop) --- examples/scankernel.jl | 62 ++++++++++++++++++++++++++ src/bytecode/encodings.jl | 72 ++++++++++++++++++++++++++++++ src/compiler/intrinsics/core.jl | 79 ++++++++++++++++++++++++++++++++- src/language/operations.jl | 13 ++++++ test/codegen.jl | 74 +++++++++++++++++++++++++++++- 5 files changed, 298 insertions(+), 2 deletions(-) create mode 100644 examples/scankernel.jl 
diff --git a/examples/scankernel.jl b/examples/scankernel.jl new file mode 100644 index 0000000..b0764df --- /dev/null +++ b/examples/scankernel.jl @@ -0,0 +1,62 @@ +using Test +using CUDA +using cuTile +import cuTile as ct + +function cumsum_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(a, bid, (tile_size[],)) + result = ct.cumsum(tile, Val(1)) # Val(1) means 1st (0th) dimension for 1D tile + ct.store(b, bid, result) + return nothing +end + +sz = 32 +N = 2^15 +a = CUDA.rand(Float32, N) +b = CUDA.zeros(Float32, N) +CUDA.@sync ct.launch(cumsum_1d_kernel, cld(length(a), sz), a, b, ct.Constant(sz)) + +# This is supposed to be a single pass kernel but its simpler version than memory ordering version. +# The idea is to show how device scan operation can be done. + +# CSDL phase 1: Intra-tile scan + store tile sums +function cumsum_csdl_phase1(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + tile_sums::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(a, bid, (tile_size[],)) + result = ct.cumsum(tile, Val(1)) + ct.store(b, bid, result) + tile_sum = ct.extract(result, (tile_size[],), (1,)) # Extract last element (1 element shape) + ct.store(tile_sums, bid, tile_sum) + return +end + +# CSDL phase 2: Decoupled lookback to accumulate previous tile sums +function cumsum_csdl_phase2(b::ct.TileArray{Float32,1}, + tile_sums::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + prev_sum = ct.zeros((tile_size[],), Float32) + k = Int32(bid) + while k > 1 + tile_sum_k = ct.load(tile_sums, (k,), (1,)) + prev_sum = prev_sum .+ tile_sum_k + k -= Int32(1) + end + tile = ct.load(b, bid, (tile_size[],)) + result = tile .+ prev_sum + ct.store(b, bid, result) + return nothing +end + +n = length(a) +num_tiles = cld(n, sz) +tile_sums = CUDA.zeros(Float32, num_tiles) +CUDA.@sync ct.launch(cumsum_csdl_phase1, num_tiles, a, b, tile_sums, 
ct.Constant(sz)) +CUDA.@sync ct.launch(cumsum_csdl_phase2, num_tiles, b, tile_sums, ct.Constant(sz)) + +b_cpu = cumsum(a |> collect, dims=1) +@test isapprox(b |> collect, b_cpu) # This might fail occasionally diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl index 1e1672a..b7b5c61 100644 --- a/src/bytecode/encodings.jl +++ b/src/bytecode/encodings.jl @@ -1331,6 +1331,78 @@ function encode_ReduceOp!(body::Function, cb::CodeBuilder, end end + +#============================================================================= + Scan operations +=============================================================================# + +""" + encode_ScanOp!(body::Function, cb::CodeBuilder, + result_types::Vector{TypeId}, + operands::Vector{Value}, + dim::Int, + reverse::Bool, + identities::Vector{<:IdentityOp}, + body_scalar_types::Vector{TypeId}) + +Encode a ScanOp (parallel prefix sum) operation. + +# Arguments +- body: Function that takes block args and yields result(s) +- cb: CodeBuilder for the bytecode +- result_types: Output tile types +- operands: Input tiles to scan +- dim: Dimension to scan along (0-indexed) +- reverse: Whether to scan in reverse order +- identities: Identity values for each operand (reuses IdentityOp from IntegerReduce) +- body_scalar_types: 0D tile types for body arguments +""" +function encode_ScanOp!(body::Function, cb::CodeBuilder, + result_types::Vector{TypeId}, + operands::Vector{Value}, + dim::Int, + reverse::Bool, + identities::Vector{<:IdentityOp}, + body_scalar_types::Vector{TypeId}) + encode_varint!(cb.buf, Opcode.ScanOp) + + # Variadic result types + encode_typeid_seq!(cb.buf, result_types) + + # Attributes: dim (int), reverse (bool), identities (array) + encode_opattr_int!(cb, dim) + encode_opattr_bool!(cb, reverse) + encode_identity_array!(cb, identities) + + # Variadic operands + encode_varint!(cb.buf, length(operands)) + encode_operands!(cb.buf, operands) + + # Number of regions + push!(cb.debug_attrs, cb.cur_debug_attr) + 
cb.num_ops += 1 + encode_varint!(cb.buf, 1) # 1 region: body + + # Body region - block args are pairs of (acc, elem) for each operand + # The body operates on 0D tiles (scalars) + body_arg_types = TypeId[] + for scalar_type in body_scalar_types + push!(body_arg_types, scalar_type) # accumulator + push!(body_arg_types, scalar_type) # element + end + with_region(body, cb, body_arg_types) + + # Create result values + num_results = length(result_types) + if num_results == 0 + return Value[] + else + vals = [Value(cb.next_value_id + i) for i in 0:num_results-1] + cb.next_value_id += num_results + return vals + end +end + #============================================================================= Comparison and selection operations =============================================================================# diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 822a09f..0a560f4 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -702,7 +702,84 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reshape), args) CGVal(current_val, result_type_id, Tile{elem_type, Tuple(target_shape)}, target_shape) end -# TODO: cuda_tile.scan +# cuda_tile.scan +@eval Intrinsics begin + """ + scan(tile, axis_val, fn_type; reverse=false) + + Parallel prefix scan along specified dimension. + fn_type=:add for cumulative sum (only supported operation). + reverse=false for forward scan, true for reverse scan. + Compiled to cuda_tile.scan. 
+ """ + @noinline function scan(tile::Tile{T, S}, ::Val{axis}, fn::Symbol, reverse::Bool=false) where {T, S, axis} + # Scan preserves shape - result has same dimensions as input + Tile{T, S}() + end +end + +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) + cb = ctx.cb + tt = ctx.tt + + # Get input tile + input_tv = emit_value!(ctx, args[1]) + input_tv === nothing && error("Cannot resolve input tile for scan") + + # Get scan axis + axis = @something get_constant(ctx, args[2]) error("Scan axis must be a compile-time constant") + + # Get scan function type (only :add is supported) + fn_type = @something get_constant(ctx, args[3]) error("Scan function type must be a compile-time constant") + fn_type == :add || error("Only :add (cumulative sum) is currently supported for scan operations") + + # Get reverse flag (optional, defaults to false) + reverse = false + if length(args) >= 4 + reverse_val = get_constant(ctx, args[4]) + reverse = reverse_val === true + end + + # Get element type and shapes + input_type = unwrap_type(input_tv.jltype) + elem_type = input_type <: Tile ? 
input_type.parameters[1] : input_type + input_shape = input_tv.shape + + # For scan, output shape is same as input shape + output_shape = copy(input_shape) + + dtype = julia_to_tile_dtype!(tt, elem_type) + + # Output tile type (same shape as input) + output_tile_type = tile_type!(tt, dtype, output_shape) + + # Scalar type for scan body (0D tile) + scalar_tile_type = tile_type!(tt, dtype, Int[]) + + # Create identity value using operation_identity + # Reuses FloatIdentityOp and IntegerIdentityOp from IntegerReduce + identity = operation_identity(Val(fn_type), dtype, elem_type) + + # Emit ScanOp + results = encode_ScanOp!(cb, [output_tile_type], [input_tv.v], axis, reverse, [identity], [scalar_tile_type]) do block_args + acc, elem = block_args[1], block_args[2] + res = encode_scan_body(cb, scalar_tile_type, acc, elem, Val(fn_type), elem_type) + encode_YieldOp!(cb, [res]) + end + + + CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape) +end + +# Dispatch helpers for scan body operations - dispatch on Val{fn} and elem_type +encode_scan_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat = + encode_AddFOp!(cb, type, acc, elem) +encode_scan_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer = + encode_AddIOp!(cb, type, acc, elem) +encode_scan_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: AbstractFloat = + encode_MaxFOp!(cb, type, acc, elem) +encode_scan_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: Integer = + encode_MaxIOp!(cb, type, acc, elem; signedness=is_signed(T) ? 
SignednessSigned : SignednessUnsigned) # cuda_tile.select @eval Intrinsics begin diff --git a/src/language/operations.jl b/src/language/operations.jl index c9c9546..449a798 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -553,6 +553,19 @@ end Intrinsics.reduce_max(tile, Val(axis - 1)) end +# Scan (Prefix Sum) Operations + +@inline function scan(tile::Tile{T, S}, ::Val{axis}, + fn::Symbol=:add, + reverse::Bool=false) where {T<:Number, S, axis} + Intrinsics.scan(tile, Val(axis - 1), fn, reverse) +end + +@inline function cumsum(tile::Tile{T, S}, ::Val{axis}, + reverse::Bool=false) where {T<:Number, S, axis} + scan(tile, Val(axis), :add, reverse) +end + #============================================================================= Matrix multiplication =============================================================================# diff --git a/test/codegen.jl b/test/codegen.jl index ea596a9..0bc96f6 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -19,7 +19,79 @@ # TODO: mmai - integer matrix multiply-accumulate # TODO: offset - tile offset computation # TODO: pack - pack tiles - # TODO: scan - parallel scan/prefix sum + @testset "scan" begin + # 1D cumulative sum (forward scan) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}, ct.TileArray{Float32,1,spec1d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + result = ct.scan(tile, Val(1), :add, false) + ct.store(b, pid, result) + return + end + end + + # 2D cumulative sum along axis 1 (columns) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 8)) + result = ct.scan(tile, Val(2), :add, false) + ct.store(b, pid, result) + return + end + end + + # 2D cumulative sum along axis 2 (rows) - forward scan + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, 
ct.TileArray{Float32,2,spec2d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 8)) + result = ct.scan(tile, Val(1), :add, false) + ct.store(b, pid, result) + return + end + end + + # 2D cumulative sum along axis 2 (rows) - reverse scan + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 8)) + result = ct.scan(tile, Val(1), :add, true) + ct.store(b, pid, result) + return + end + end + + # Integer cumulative sum + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec1d}, ct.TileArray{Int32,1,spec1d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + result = ct.scan(tile, Val(1), :add, false) + ct.store(b, pid, result) + return + end + end + + # cumsum convenience function (forward scan) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 8)) + result = ct.cumsum(tile, Val(2), false) + ct.store(b, pid, result) + return + end + end + end # TODO: unpack - unpack tiles @testset "reshape" begin From 50c5e596991f062598f7a531e0c92c158881edf2 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Jan 2026 15:43:41 +0100 Subject: [PATCH 2/6] Fix. --- src/bytecode/encodings.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl index b7b5c61..bdf6db0 100644 --- a/src/bytecode/encodings.jl +++ b/src/bytecode/encodings.jl @@ -1342,7 +1342,7 @@ end operands::Vector{Value}, dim::Int, reverse::Bool, - identities::Vector{<:IdentityOp}, + identities::Vector{<:IdentityVal}, body_scalar_types::Vector{TypeId}) Encode a ScanOp (parallel prefix sum) operation. @@ -1354,7 +1354,7 @@ Encode a ScanOp (parallel prefix sum) operation. 
- operands: Input tiles to scan - dim: Dimension to scan along (0-indexed) - reverse: Whether to scan in reverse order -- identities: Identity values for each operand (reuses IdentityOp from IntegerReduce) +- identities: Identity values for each operand - body_scalar_types: 0D tile types for body arguments """ function encode_ScanOp!(body::Function, cb::CodeBuilder, @@ -1362,7 +1362,7 @@ function encode_ScanOp!(body::Function, cb::CodeBuilder, operands::Vector{Value}, dim::Int, reverse::Bool, - identities::Vector{<:IdentityOp}, + identities::Vector{<:IdentityVal}, body_scalar_types::Vector{TypeId}) encode_varint!(cb.buf, Opcode.ScanOp) From f8702d760c0bd7f2a354c1faf493ab08a0912693 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Jan 2026 15:43:49 +0100 Subject: [PATCH 3/6] Deduplicate. --- src/compiler/intrinsics/core.jl | 42 +++++++++------------------------ 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 0a560f4..d9a361f 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -569,7 +569,7 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol) results = encode_ReduceOp!(cb, [output_tile_type], [input_tv.v], axis, [identity], [scalar_tile_type]) do block_args acc, elem = block_args[1], block_args[2] - res = encode_reduce_body(cb, scalar_tile_type, acc, elem, reduce_fn, elem_type) + res = encode_binop_body(cb, scalar_tile_type, acc, elem, reduce_fn, elem_type) encode_YieldOp!(cb, [res]) end @@ -609,26 +609,18 @@ operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = IntegerIdentityVal(to_uint128(typemin(T)), dtype, T) #=============================================================================# -# Reduce Body Operations +# Binary Operation Body Encoding (shared by reduce and scan) #=============================================================================# -function encode_reduce_body(cb, type, acc, elem, 
op::Symbol, ::Type{T}) where T +function encode_binop_body(cb, type, acc, elem, op::Symbol, ::Type{T}) where T if T <: AbstractFloat - if op == :add - encode_AddFOp!(cb, type, acc, elem) - elseif op == :max - encode_MaxFOp!(cb, type, acc, elem) - else - error("Unsupported float reduction operation: $op") - end - else # Integer + op == :add ? encode_AddFOp!(cb, type, acc, elem) : + op == :max ? encode_MaxFOp!(cb, type, acc, elem) : + error("Unsupported float operation: $op") + else signedness = T <: Signed ? SignednessSigned : SignednessUnsigned - if op == :add - encode_AddIOp!(cb, type, acc, elem) - elseif op == :max - encode_MaxIOp!(cb, type, acc, elem; signedness) - else - error("Unsupported integer reduction operation: $op") - end + op == :add ? encode_AddIOp!(cb, type, acc, elem) : + op == :max ? encode_MaxIOp!(cb, type, acc, elem; signedness) : + error("Unsupported integer operation: $op") end end @@ -757,30 +749,18 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) scalar_tile_type = tile_type!(tt, dtype, Int[]) # Create identity value using operation_identity - # Reuses FloatIdentityOp and IntegerIdentityOp from IntegerReduce identity = operation_identity(Val(fn_type), dtype, elem_type) # Emit ScanOp results = encode_ScanOp!(cb, [output_tile_type], [input_tv.v], axis, reverse, [identity], [scalar_tile_type]) do block_args acc, elem = block_args[1], block_args[2] - res = encode_scan_body(cb, scalar_tile_type, acc, elem, Val(fn_type), elem_type) + res = encode_binop_body(cb, scalar_tile_type, acc, elem, fn_type, elem_type) encode_YieldOp!(cb, [res]) end - CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape) end -# Dispatch helpers for scan body operations - dispatch on Val{fn} and elem_type -encode_scan_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat = - encode_AddFOp!(cb, type, acc, elem) -encode_scan_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer = - 
encode_AddIOp!(cb, type, acc, elem) -encode_scan_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: AbstractFloat = - encode_MaxFOp!(cb, type, acc, elem) -encode_scan_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: Integer = - encode_MaxIOp!(cb, type, acc, elem; signedness=is_signed(T) ? SignednessSigned : SignednessUnsigned) - # cuda_tile.select @eval Intrinsics begin """ From 25d0f445d1f7840dc981f39bec6b95d5c7d7ecda Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Jan 2026 15:44:03 +0100 Subject: [PATCH 4/6] Shorten and add tests. --- test/codegen.jl | 146 +++++++++++++++------------------------------- test/execution.jl | 49 +++++++++++++++- 2 files changed, 95 insertions(+), 100 deletions(-) diff --git a/test/codegen.jl b/test/codegen.jl index 0bc96f6..58bb9aa 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -20,74 +20,58 @@ # TODO: offset - tile offset computation # TODO: pack - pack tiles @testset "scan" begin - # 1D cumulative sum (forward scan) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}, ct.TileArray{Float32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - result = ct.scan(tile, Val(1), :add, false) - ct.store(b, pid, result) - return - end - end - - # 2D cumulative sum along axis 1 (columns) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (4, 8)) - result = ct.scan(tile, Val(2), :add, false) - ct.store(b, pid, result) - return + # Forward scan - float and integer types + for (T, spec, op_check) in [ + (Float32, spec1d, "addf"), + (Int32, spec1d, "addi"), + ] + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{T,1,spec}}) do a + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + @check "scan" + @check op_check + Base.donotdelete(ct.scan(tile, Val(1), :add, false)) + return + end end end - 
# 2D cumulative sum along axis 2 (rows) - forward scan + # 2D scan along different axes @test @filecheck begin @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a pid = ct.bid(1) tile = ct.load(a, pid, (4, 8)) - result = ct.scan(tile, Val(1), :add, false) - ct.store(b, pid, result) + @check "scan" + Base.donotdelete(ct.scan(tile, Val(1), :add, false)) + @check "scan" + Base.donotdelete(ct.scan(tile, Val(2), :add, false)) return end end - # 2D cumulative sum along axis 2 (rows) - reverse scan + # Reverse scan @test @filecheck begin @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a pid = ct.bid(1) tile = ct.load(a, pid, (4, 8)) - result = ct.scan(tile, Val(1), :add, true) - ct.store(b, pid, result) + @check "scan" + Base.donotdelete(ct.scan(tile, Val(1), :add, true)) return end end - # Integer cumulative sum + # cumsum convenience @test @filecheck begin @check_label "entry" - code_tiled(Tuple{ct.TileArray{Int32,1,spec1d}, ct.TileArray{Int32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - result = ct.scan(tile, Val(1), :add, false) - ct.store(b, pid, result) - return - end - end - - # cumsum convenience function (forward scan) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a pid = ct.bid(1) tile = ct.load(a, pid, (4, 8)) - result = ct.cumsum(tile, Val(2), false) - ct.store(b, pid, result) + @check "scan" + Base.donotdelete(ct.cumsum(tile, Val(2), false)) return end end @@ -457,61 +441,25 @@ return end end - end - - # Integer reduce_sum (Int32) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Int32,2,spec2d}, 
ct.TileArray{Int32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (4, 16)) - @check "reduce" - @check "addi" - sums = ct.reduce_sum(tile, 2) - ct.store(b, pid, sums) - return - end - end - # Integer reduce_max (Int32) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Int32,2,spec2d}, ct.TileArray{Int32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (4, 16)) - @check "reduce" - @check "maxi" - maxes = ct.reduce_max(tile, 2) - ct.store(b, pid, maxes) - return - end - end - - # Unsigned reduce_sum (UInt32) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{UInt32,2,spec2d}, ct.TileArray{UInt32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (4, 16)) - @check "reduce" - @check "addi" - sums = ct.reduce_sum(tile, 2) - ct.store(b, pid, sums) - return - end - end - - # Unsigned reduce_max (UInt32) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{UInt32,2,spec2d}, ct.TileArray{UInt32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (4, 16)) - @check "reduce" - @check "maxi" - maxes = ct.reduce_max(tile, 2) - ct.store(b, pid, maxes) - return + # Integer/unsigned reduce + for (T, op, op_check) in [ + (Int32, ct.reduce_sum, "addi"), + (Int32, ct.reduce_max, "maxi"), + (UInt32, ct.reduce_sum, "addi"), + (UInt32, ct.reduce_max, "maxi"), + ] + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{T,2,spec2d}}) do a + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 16)) + @check "reduce" + @check op_check + Base.donotdelete(op(tile, 2)) + return + end + end end end diff --git a/test/execution.jl b/test/execution.jl index 6c4e493..bf0585f 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -1834,7 +1834,9 @@ end # UInt16: 1 to 2000 (32 * 2000 = 64,000, safely within UInt16 range 0 to 65,535) # Larger types: -1000 to 1000 (arbitrary but covers positive/negative) # Floats: 0 to 1 (CUDA.rand default) - if 
elType == Int8 + if elType == UInt8 + a_gpu = CuArray{UInt8}(rand(UInt8(0):UInt8(7), N)) + elseif elType == Int8 a_gpu = CuArray{Int8}(rand(-3:3, N)) elseif elType == Int16 a_gpu = CuArray{Int16}(rand(-800:800, N)) @@ -1870,6 +1872,51 @@ end end end +# Kernel factory for scan operations +function makeScanKernel(::Type{T}) where {T} + @inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) + ct.store(b, ct.bid(1), ct.cumsum(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + return nothing + end + return kernel +end + +@testset "1D scan (cumsum)" begin + TILE_SIZE = 32 + N = 1024 + + TEST_TYPES = [Float16, Float32, Float64, Int32, Int64, UInt32, UInt64] + + @testset "Type: $elType" for elType in TEST_TYPES + scanKernel = makeScanKernel(elType) + + # Type-appropriate input generation (small values to avoid overflow in cumsum) + if elType <: Integer && elType <: Signed + a_gpu = CuArray{elType}(rand(elType(-3):elType(3), N)) + elseif elType <: Integer + a_gpu = CuArray{elType}(rand(elType(0):elType(7), N)) + else + a_gpu = CUDA.rand(elType, N) + end + b_gpu = CUDA.zeros(elType, N) + + CUDA.@sync ct.launch(scanKernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) + + a_cpu = Array(a_gpu) + b_cpu = Array(b_gpu) + + # CPU reference: per-tile cumulative sum + a_reshaped = reshape(a_cpu, TILE_SIZE, :) + expected = mapslices(x -> accumulate(+, x), a_reshaped, dims=1) + + if elType <: AbstractFloat + @test b_cpu ≈ vec(expected) rtol=1e-3 + else + @test b_cpu == vec(expected) + end + end +end + @testset "transpose with hints" begin function transpose_with_hints(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) From 5bffaa825030bb8fa8db82a7a3e3616feb954c07 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Jan 2026 15:45:20 +0100 Subject: [PATCH 5/6] Simplify. 
--- src/compiler/intrinsics/conversions.jl | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl index 5e0d5ae..8986b72 100644 --- a/src/compiler/intrinsics/conversions.jl +++ b/src/compiler/intrinsics/conversions.jl @@ -40,22 +40,17 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.astype), args) target_dtype = julia_to_tile_dtype!(tt, target_elem) target_tile_type = tile_type!(tt, target_dtype, tile_shape) - # Determine signedness for integer types - function is_signed_int(T) - T <: Signed || T === Int32 || T === Int64 || T === Int16 || T === Int8 - end - # Emit conversion based on source and target types result = if source_elem <: AbstractFloat && target_elem <: AbstractFloat # Float -> Float encode_FToFOp!(cb, target_tile_type, source.v) elseif source_elem <: Integer && target_elem <: AbstractFloat # Integer -> Float - signedness = is_signed_int(source_elem) ? SignednessSigned : SignednessUnsigned + signedness = source_elem <: Signed ? SignednessSigned : SignednessUnsigned encode_IToFOp!(cb, target_tile_type, source.v; signedness) elseif source_elem <: AbstractFloat && target_elem <: Integer # Float -> Integer - signedness = is_signed_int(target_elem) ? SignednessSigned : SignednessUnsigned + signedness = target_elem <: Signed ? SignednessSigned : SignednessUnsigned encode_FToIOp!(cb, target_tile_type, source.v; signedness) elseif source_elem <: Integer && target_elem <: Integer # Integer -> Integer @@ -66,7 +61,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.astype), args) source.v elseif target_size > source_size # Extension (upsize) - signedness = is_signed_int(source_elem) ? SignednessSigned : SignednessUnsigned + signedness = source_elem <: Signed ? 
SignednessSigned : SignednessUnsigned encode_ExtIOp!(cb, target_tile_type, source.v; signedness) else # Truncation (downsize) From 1d6ec341329e3de11f0e98c65218c339592c6148 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Jan 2026 15:53:53 +0100 Subject: [PATCH 6/6] Move scan example into tests. --- examples/scankernel.jl | 62 ----------- test/execution.jl | 228 +++++++++++++++++++++++------------------ 2 files changed, 131 insertions(+), 159 deletions(-) delete mode 100644 examples/scankernel.jl diff --git a/examples/scankernel.jl b/examples/scankernel.jl deleted file mode 100644 index b0764df..0000000 --- a/examples/scankernel.jl +++ /dev/null @@ -1,62 +0,0 @@ -using Test -using CUDA -using cuTile -import cuTile as ct - -function cumsum_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - tile_size::ct.Constant{Int}) - bid = ct.bid(1) - tile = ct.load(a, bid, (tile_size[],)) - result = ct.cumsum(tile, Val(1)) # Val(1) means 1st (0th) dimension for 1D tile - ct.store(b, bid, result) - return nothing -end - -sz = 32 -N = 2^15 -a = CUDA.rand(Float32, N) -b = CUDA.zeros(Float32, N) -CUDA.@sync ct.launch(cumsum_1d_kernel, cld(length(a), sz), a, b, ct.Constant(sz)) - -# This is supposed to be a single pass kernel but its simpler version than memory ordering version. -# The idea is to show how device scan operation can be done. 
- -# CSDL phase 1: Intra-tile scan + store tile sums -function cumsum_csdl_phase1(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - tile_sums::ct.TileArray{Float32,1}, - tile_size::ct.Constant{Int}) - bid = ct.bid(1) - tile = ct.load(a, bid, (tile_size[],)) - result = ct.cumsum(tile, Val(1)) - ct.store(b, bid, result) - tile_sum = ct.extract(result, (tile_size[],), (1,)) # Extract last element (1 element shape) - ct.store(tile_sums, bid, tile_sum) - return -end - -# CSDL phase 2: Decoupled lookback to accumulate previous tile sums -function cumsum_csdl_phase2(b::ct.TileArray{Float32,1}, - tile_sums::ct.TileArray{Float32,1}, - tile_size::ct.Constant{Int}) - bid = ct.bid(1) - prev_sum = ct.zeros((tile_size[],), Float32) - k = Int32(bid) - while k > 1 - tile_sum_k = ct.load(tile_sums, (k,), (1,)) - prev_sum = prev_sum .+ tile_sum_k - k -= Int32(1) - end - tile = ct.load(b, bid, (tile_size[],)) - result = tile .+ prev_sum - ct.store(b, bid, result) - return nothing -end - -n = length(a) -num_tiles = cld(n, sz) -tile_sums = CUDA.zeros(Float32, num_tiles) -CUDA.@sync ct.launch(cumsum_csdl_phase1, num_tiles, a, b, tile_sums, ct.Constant(sz)) -CUDA.@sync ct.launch(cumsum_csdl_phase2, num_tiles, b, tile_sums, ct.Constant(sz)) - -b_cpu = cumsum(a |> collect, dims=1) -@test isapprox(b |> collect, b_cpu) # This might fail occasionally diff --git a/test/execution.jl b/test/execution.jl index bf0585f..f12603f 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -997,6 +997,85 @@ end end +@testset "scan" begin + +@testset "1D cumsum (forward)" begin + function cumsum_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(a, bid, (tile_size[],)) + result = ct.cumsum(tile, Val(1)) + ct.store(b, bid, result) + return nothing + end + + sz = 32 + N = 1024 + a = CUDA.rand(Float32, N) + b = CUDA.zeros(Float32, N) + + ct.launch(cumsum_1d_kernel, cld(N, sz), a, b, ct.Constant(sz)) + + # Per-tile 
cumulative sum + a_cpu = Array(a) + b_cpu = Array(b) + a_reshaped = reshape(a_cpu, sz, :) + expected = mapslices(x -> accumulate(+, x), a_reshaped, dims=1) + @test b_cpu ≈ vec(expected) rtol=1e-3 +end + +@testset "2D cumsum along axis 1" begin + function cumsum_2d_axis1_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (4, 8)) + result = ct.cumsum(tile, Val(1)) + ct.store(b, (pid, 1), result) + return nothing + end + + m, n = 32, 8 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m, n) + + ct.launch(cumsum_2d_axis1_kernel, cld(m, 4), a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + # cumsum along dim 1 within each 4-row tile + for bid in 0:(cld(m, 4)-1) + rows = (bid*4+1):(bid*4+4) + for j in 1:n + @test b_cpu[rows, j] ≈ accumulate(+, a_cpu[rows, j]) rtol=1e-3 + end + end +end + +@testset "1D reverse cumsum" begin + function reverse_cumsum_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(a, bid, (tile_size[],)) + result = ct.scan(tile, Val(1), :add, true) + ct.store(b, bid, result) + return nothing + end + + sz = 32 + N = 1024 + a = CUDA.rand(Float32, N) + b = CUDA.zeros(Float32, N) + + ct.launch(reverse_cumsum_kernel, cld(N, sz), a, b, ct.Constant(sz)) + + a_cpu = Array(a) + b_cpu = Array(b) + a_reshaped = reshape(a_cpu, sz, :) + expected = mapslices(x -> reverse(accumulate(+, reverse(x))), a_reshaped, dims=1) + @test b_cpu ≈ vec(expected) rtol=1e-3 +end + +end + @testset "scalar-tile operations" begin for (name, kernel_expr, cpu_expr) in [ @@ -1536,7 +1615,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1557,7 +1636,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_kernel_occupancy, 64, a, b, c; occupancy=4) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1578,7 +1657,7 @@ end c = 
CUDA.zeros(Float32, n) ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1603,7 +1682,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_with_load_latency, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1624,7 +1703,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_no_tma, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1645,7 +1724,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_both_load_hints, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1663,7 +1742,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(copy_with_store_latency, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end @@ -1681,7 +1760,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(copy_no_tma_store, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end @@ -1704,7 +1783,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_mixed_hints, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1731,7 +1810,7 @@ end grid_x = cld(M, 32) grid_y = cld(N, 32) ct.launch(matmul_with_hints, (grid_x, grid_y, 1), a, b, c) - CUDA.synchronize() + # Verify against CPU reference a_cpu = Array(a) @@ -1759,7 +1838,7 @@ end b = CUDA.zeros(Float32, m) ct.launch(reduce_with_hints, m, a, b) - CUDA.synchronize() + # Each row should be summed a_cpu = Array(a) @@ -1769,71 +1848,40 @@ end end end -# Kernel factory for reduce operations - extendable pattern -function makeReduceKernel(::Type{T}, op::Symbol) where {T} - reduceFunc = if op == :reduce_sum - ct.reduce_sum - elseif op == :reduce_max - ct.reduce_max - # ADD NEW OPERATIONS HERE - # elseif op == :reduce_min - # ct.reduce_min - # elseif op == :reduce_mul - # ct.reduce_mul +@testset "1D reduce operations" begin + TILE_SIZE = 32 + N = 1024 + + function reduce_sum_1d(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, + tileSz::ct.Constant{Int}) 
where {T} + ct.store(b, ct.bid(1), ct.reduce_sum(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + return nothing end - @inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) - ct.store(b, ct.bid(1), reduceFunc(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + function reduce_max_1d(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, + tileSz::ct.Constant{Int}) where {T} + ct.store(b, ct.bid(1), ct.reduce_max(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) return nothing end - return kernel -end -# CPU reference implementation for reduce operations - extendable pattern -function cpu_reduce(a_reshaped::AbstractArray{T}, op::Symbol) where {T} - if op == :reduce_sum - result = sum(a_reshaped, dims=1)[:] - # For unsigned types, apply mask to handle overflow - if T <: Unsigned + function cpu_reduce(a_reshaped::AbstractArray{T}, op) where {T} + result = mapslices(op, a_reshaped, dims=1)[:] + # For unsigned sum, apply mask to handle overflow + if T <: Unsigned && op === sum result .= result .& typemax(T) end return result - elseif op == :reduce_max - return maximum(a_reshaped, dims=1)[:] - # ADD NEW OPERATIONS HERE - # elseif op == :reduce_min - # return minimum(a_reshaped, dims=1)[:] - # elseif op == :reduce_mul - # return prod(a_reshaped, dims=1)[:] end -end -@testset "1D reduce operations (extendable)" begin - # Test parameters - easily extendable - TILE_SIZE = 32 - N = 1024 - - # Supported types - add new types here TEST_TYPES = [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64] - - # Supported operations - add new operations here - TEST_OPS = [:reduce_sum, :reduce_max] - - @testset "Type: $elType, Operation: $op" for elType in TEST_TYPES, op in TEST_OPS - # Create kernel using factory - reduceKernel = try - makeReduceKernel(elType, op) - catch e - @test_broken false - rethrow() - end - - # Generate input data with type-appropriate ranges - # Int8: -3 to 3 (32 * 3 = 96, safely within Int8 range -128 to 127) - # 
Int16: -800 to 800 (32 * 800 = 25,600, safely within Int16 range -32,768 to 32,767) - # UInt16: 1 to 2000 (32 * 2000 = 64,000, safely within UInt16 range 0 to 65,535) - # Larger types: -1000 to 1000 (arbitrary but covers positive/negative) - # Floats: 0 to 1 (CUDA.rand default) + + TEST_OPS = [ + (reduce_sum_1d, sum), + (reduce_max_1d, maximum), + ] + + @testset "Type: $elType, Operation: $gpu_kernel" for elType in TEST_TYPES, (gpu_kernel, cpu_op) in TEST_OPS + # Generate input data with type-appropriate ranges to avoid overflow if elType == UInt8 a_gpu = CuArray{UInt8}(rand(UInt8(0):UInt8(7), N)) elseif elType == Int8 @@ -1848,22 +1896,14 @@ end a_gpu = CUDA.rand(elType, N) end b_gpu = CUDA.zeros(elType, cld(N, TILE_SIZE)) - - # Launch kernel - try - CUDA.@sync ct.launch(reduceKernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) - catch e - @test_broken false - rethrow() - end - - # Verify results + + ct.launch(gpu_kernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) + a_cpu = Array(a_gpu) b_cpu = Array(b_gpu) a_reshaped = reshape(a_cpu, TILE_SIZE, :) - cpu_result = cpu_reduce(a_reshaped, op) - - # Use appropriate comparison based on type + cpu_result = cpu_reduce(a_reshaped, cpu_op) + if elType <: AbstractFloat @test b_cpu ≈ cpu_result rtol=1e-3 else @@ -1872,24 +1912,18 @@ end end end -# Kernel factory for scan operations -function makeScanKernel(::Type{T}) where {T} - @inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) - ct.store(b, ct.bid(1), ct.cumsum(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) - return nothing - end - return kernel -end - @testset "1D scan (cumsum)" begin TILE_SIZE = 32 N = 1024 + function scan_kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) where {T} + ct.store(b, ct.bid(1), ct.cumsum(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + return nothing + end + TEST_TYPES = [Float16, Float32, Float64, Int32, Int64, UInt32, UInt64] @testset "Type: 
$elType" for elType in TEST_TYPES - scanKernel = makeScanKernel(elType) - # Type-appropriate input generation (small values to avoid overflow in cumsum) if elType <: Integer && elType <: Signed a_gpu = CuArray{elType}(rand(elType(-3):elType(3), N)) @@ -1900,7 +1934,7 @@ end end b_gpu = CUDA.zeros(elType, N) - CUDA.@sync ct.launch(scanKernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) + ct.launch(scan_kernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) a_cpu = Array(a_gpu) b_cpu = Array(b_gpu) @@ -1936,7 +1970,7 @@ end y = CUDA.zeros(Float32, n, m) ct.launch(transpose_with_hints, (cld(m, tile_size), cld(n, tile_size)), x, y) - CUDA.synchronize() + @test Array(y) ≈ transpose(Array(x)) end @@ -1967,7 +2001,7 @@ end d = CUDA.zeros(Float32, n) ct.launch(complex_hints_kernel, 64, a, b, c, d) - CUDA.synchronize() + @test Array(d) ≈ ones(Float32, n) .* 6 end @@ -1988,7 +2022,7 @@ end c = CUDA.zeros(Float64, n) ct.launch(vadd_f64_hints, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ Array(a) + Array(b) end @@ -2009,7 +2043,7 @@ end c = CUDA.zeros(Float16, n) ct.launch(vadd_f16_hints, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ Array(a) + Array(b) end @@ -2028,7 +2062,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(test_boundary_latency, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end @@ -2049,7 +2083,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(gather_with_latency, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end @@ -2069,7 +2103,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(scatter_with_latency, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end