From 368e6c587e5630c273aaa339922db202398cdc2c Mon Sep 17 00:00:00 2001 From: arhik Date: Sun, 18 Jan 2026 04:46:10 +0000 Subject: [PATCH 1/6] Add scan (prefix sum) operations support This commit adds support for scan (parallel prefix sum) operations to cuTile, based on the IntegerReduce branch and commit 0c9ab90. Key changes: - Added encode_ScanOp! to bytecode encodings for generating ScanOp bytecode - Added encode_scan_identity_array! to reuse existing identity encoding - Added scan intrinsic implementation using operation_identity from IntegerReduce - Added scan() and cumsum() public APIs with proper 1-indexed to 0-indexed axis conversion - Added comprehensive codegen tests for scan operations - Added scankernel.jl example demonstrating CSDL scan algorithm Features: - Supports cumulative sum (cumsum) for float and integer types - Supports both forward and reverse scan directions - Reuses FloatIdentityOp and IntegerIdentityOp from IntegerReduce - Uses operation_identity function for cleaner identity value creation - 1-indexed axis parameter (consistent with reduce operations) - Preserves tile shape (scan is an element-wise operation along one dimension) Tests: - All 142 codegen tests pass (including 6 new scan tests) - Scankernel.jl example runs successfully with CSDL algorithm Minor comment improvements in scankernel.jl example: - Clarify that it demonstrates device-side scan operation - Add note that test might occasionally fail (race condition in phase 2 loop) --- examples/scankernel.jl | 62 ++++++++++++++++++++++++++ src/bytecode/encodings.jl | 72 ++++++++++++++++++++++++++++++ src/compiler/intrinsics/core.jl | 79 ++++++++++++++++++++++++++++++++- src/language/operations.jl | 13 ++++++ test/codegen.jl | 74 +++++++++++++++++++++++++++++- 5 files changed, 298 insertions(+), 2 deletions(-) create mode 100644 examples/scankernel.jl 
diff --git a/examples/scankernel.jl b/examples/scankernel.jl new file mode 100644 index 0000000..b0764df --- /dev/null +++ b/examples/scankernel.jl @@ -0,0 +1,62 @@ +using Test +using CUDA +using cuTile +import cuTile as ct + +function cumsum_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(a, bid, (tile_size[],)) + result = ct.cumsum(tile, Val(1)) # Val(1) means 1st (0th) dimension for 1D tile + ct.store(b, bid, result) + return nothing +end + +sz = 32 +N = 2^15 +a = CUDA.rand(Float32, N) +b = CUDA.zeros(Float32, N) +CUDA.@sync ct.launch(cumsum_1d_kernel, cld(length(a), sz), a, b, ct.Constant(sz)) + +# This is supposed to be a single pass kernel but its simpler version than memory ordering version. +# The idea is to show how device scan operation can be done. + +# CSDL phase 1: Intra-tile scan + store tile sums +function cumsum_csdl_phase1(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + tile_sums::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(a, bid, (tile_size[],)) + result = ct.cumsum(tile, Val(1)) + ct.store(b, bid, result) + tile_sum = ct.extract(result, (tile_size[],), (1,)) # Extract last element (1 element shape) + ct.store(tile_sums, bid, tile_sum) + return +end + +# CSDL phase 2: Decoupled lookback to accumulate previous tile sums +function cumsum_csdl_phase2(b::ct.TileArray{Float32,1}, + tile_sums::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + prev_sum = ct.zeros((tile_size[],), Float32) + k = Int32(bid) + while k > 1 + tile_sum_k = ct.load(tile_sums, (k,), (1,)) + prev_sum = prev_sum .+ tile_sum_k + k -= Int32(1) + end + tile = ct.load(b, bid, (tile_size[],)) + result = tile .+ prev_sum + ct.store(b, bid, result) + return nothing +end + +n = length(a) +num_tiles = cld(n, sz) +tile_sums = CUDA.zeros(Float32, num_tiles) +CUDA.@sync ct.launch(cumsum_csdl_phase1, num_tiles, a, b, tile_sums, 
ct.Constant(sz)) +CUDA.@sync ct.launch(cumsum_csdl_phase2, num_tiles, b, tile_sums, ct.Constant(sz)) + +b_cpu = cumsum(a |> collect, dims=1) +@test isapprox(b |> collect, b_cpu) # This might fail occasionally diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl index 1e1672a..b7b5c61 100644 --- a/src/bytecode/encodings.jl +++ b/src/bytecode/encodings.jl @@ -1331,6 +1331,78 @@ function encode_ReduceOp!(body::Function, cb::CodeBuilder, end end + +#============================================================================= + Scan operations +=============================================================================# + +""" + encode_ScanOp!(body::Function, cb::CodeBuilder, + result_types::Vector{TypeId}, + operands::Vector{Value}, + dim::Int, + reverse::Bool, + identities::Vector{<:IdentityOp}, + body_scalar_types::Vector{TypeId}) + +Encode a ScanOp (parallel prefix sum) operation. + +# Arguments +- body: Function that takes block args and yields result(s) +- cb: CodeBuilder for the bytecode +- result_types: Output tile types +- operands: Input tiles to scan +- dim: Dimension to scan along (0-indexed) +- reverse: Whether to scan in reverse order +- identities: Identity values for each operand (reuses IdentityOp from IntegerReduce) +- body_scalar_types: 0D tile types for body arguments +""" +function encode_ScanOp!(body::Function, cb::CodeBuilder, + result_types::Vector{TypeId}, + operands::Vector{Value}, + dim::Int, + reverse::Bool, + identities::Vector{<:IdentityOp}, + body_scalar_types::Vector{TypeId}) + encode_varint!(cb.buf, Opcode.ScanOp) + + # Variadic result types + encode_typeid_seq!(cb.buf, result_types) + + # Attributes: dim (int), reverse (bool), identities (array) + encode_opattr_int!(cb, dim) + encode_opattr_bool!(cb, reverse) + encode_identity_array!(cb, identities) + + # Variadic operands + encode_varint!(cb.buf, length(operands)) + encode_operands!(cb.buf, operands) + + # Number of regions + push!(cb.debug_attrs, cb.cur_debug_attr) + 
cb.num_ops += 1 + encode_varint!(cb.buf, 1) # 1 region: body + + # Body region - block args are pairs of (acc, elem) for each operand + # The body operates on 0D tiles (scalars) + body_arg_types = TypeId[] + for scalar_type in body_scalar_types + push!(body_arg_types, scalar_type) # accumulator + push!(body_arg_types, scalar_type) # element + end + with_region(body, cb, body_arg_types) + + # Create result values + num_results = length(result_types) + if num_results == 0 + return Value[] + else + vals = [Value(cb.next_value_id + i) for i in 0:num_results-1] + cb.next_value_id += num_results + return vals + end +end + #============================================================================= Comparison and selection operations =============================================================================# diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 822a09f..0a560f4 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -702,7 +702,84 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reshape), args) CGVal(current_val, result_type_id, Tile{elem_type, Tuple(target_shape)}, target_shape) end -# TODO: cuda_tile.scan +# cuda_tile.scan +@eval Intrinsics begin + """ + scan(tile, axis_val, fn_type; reverse=false) + + Parallel prefix scan along specified dimension. + fn_type=:add for cumulative sum (only supported operation). + reverse=false for forward scan, true for reverse scan. + Compiled to cuda_tile.scan. 
+ """ + @noinline function scan(tile::Tile{T, S}, ::Val{axis}, fn::Symbol, reverse::Bool=false) where {T, S, axis} + # Scan preserves shape - result has same dimensions as input + Tile{T, S}() + end +end + +function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) + cb = ctx.cb + tt = ctx.tt + + # Get input tile + input_tv = emit_value!(ctx, args[1]) + input_tv === nothing && error("Cannot resolve input tile for scan") + + # Get scan axis + axis = @something get_constant(ctx, args[2]) error("Scan axis must be a compile-time constant") + + # Get scan function type (only :add is supported) + fn_type = @something get_constant(ctx, args[3]) error("Scan function type must be a compile-time constant") + fn_type == :add || error("Only :add (cumulative sum) is currently supported for scan operations") + + # Get reverse flag (optional, defaults to false) + reverse = false + if length(args) >= 4 + reverse_val = get_constant(ctx, args[4]) + reverse = reverse_val === true + end + + # Get element type and shapes + input_type = unwrap_type(input_tv.jltype) + elem_type = input_type <: Tile ? 
input_type.parameters[1] : input_type + input_shape = input_tv.shape + + # For scan, output shape is same as input shape + output_shape = copy(input_shape) + + dtype = julia_to_tile_dtype!(tt, elem_type) + + # Output tile type (same shape as input) + output_tile_type = tile_type!(tt, dtype, output_shape) + + # Scalar type for scan body (0D tile) + scalar_tile_type = tile_type!(tt, dtype, Int[]) + + # Create identity value using operation_identity + # Reuses FloatIdentityOp and IntegerIdentityOp from IntegerReduce + identity = operation_identity(Val(fn_type), dtype, elem_type) + + # Emit ScanOp + results = encode_ScanOp!(cb, [output_tile_type], [input_tv.v], axis, reverse, [identity], [scalar_tile_type]) do block_args + acc, elem = block_args[1], block_args[2] + res = encode_scan_body(cb, scalar_tile_type, acc, elem, Val(fn_type), elem_type) + encode_YieldOp!(cb, [res]) + end + + + CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape) +end + +# Dispatch helpers for scan body operations - dispatch on Val{fn} and elem_type +encode_scan_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat = + encode_AddFOp!(cb, type, acc, elem) +encode_scan_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer = + encode_AddIOp!(cb, type, acc, elem) +encode_scan_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: AbstractFloat = + encode_MaxFOp!(cb, type, acc, elem) +encode_scan_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: Integer = + encode_MaxIOp!(cb, type, acc, elem; signedness=is_signed(T) ? 
SignednessSigned : SignednessUnsigned) # cuda_tile.select @eval Intrinsics begin diff --git a/src/language/operations.jl b/src/language/operations.jl index c9c9546..449a798 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -553,6 +553,19 @@ end Intrinsics.reduce_max(tile, Val(axis - 1)) end +# Scan (Prefix Sum) Operations + +@inline function scan(tile::Tile{T, S}, ::Val{axis}, + fn::Symbol=:add, + reverse::Bool=false) where {T<:Number, S, axis} + Intrinsics.scan(tile, Val(axis - 1), fn, reverse) +end + +@inline function cumsum(tile::Tile{T, S}, ::Val{axis}, + reverse::Bool=false) where {T<:Number, S, axis} + scan(tile, Val(axis), :add, reverse) +end + #============================================================================= Matrix multiplication =============================================================================# diff --git a/test/codegen.jl b/test/codegen.jl index ea596a9..0bc96f6 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -19,7 +19,79 @@ # TODO: mmai - integer matrix multiply-accumulate # TODO: offset - tile offset computation # TODO: pack - pack tiles - # TODO: scan - parallel scan/prefix sum + @testset "scan" begin + # 1D cumulative sum (forward scan) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}, ct.TileArray{Float32,1,spec1d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + result = ct.scan(tile, Val(1), :add, false) + ct.store(b, pid, result) + return + end + end + + # 2D cumulative sum along axis 1 (columns) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 8)) + result = ct.scan(tile, Val(2), :add, false) + ct.store(b, pid, result) + return + end + end + + # 2D cumulative sum along axis 2 (rows) - forward scan + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, 
ct.TileArray{Float32,2,spec2d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 8)) + result = ct.scan(tile, Val(1), :add, false) + ct.store(b, pid, result) + return + end + end + + # 2D cumulative sum along axis 2 (rows) - reverse scan + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 8)) + result = ct.scan(tile, Val(1), :add, true) + ct.store(b, pid, result) + return + end + end + + # Integer cumulative sum + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec1d}, ct.TileArray{Int32,1,spec1d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + result = ct.scan(tile, Val(1), :add, false) + ct.store(b, pid, result) + return + end + end + + # cumsum convenience function (forward scan) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 8)) + result = ct.cumsum(tile, Val(2), false) + ct.store(b, pid, result) + return + end + end + end # TODO: unpack - unpack tiles @testset "reshape" begin From 50c5e596991f062598f7a531e0c92c158881edf2 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Jan 2026 15:43:41 +0100 Subject: [PATCH 2/6] Fix. --- src/bytecode/encodings.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bytecode/encodings.jl b/src/bytecode/encodings.jl index b7b5c61..bdf6db0 100644 --- a/src/bytecode/encodings.jl +++ b/src/bytecode/encodings.jl @@ -1342,7 +1342,7 @@ end operands::Vector{Value}, dim::Int, reverse::Bool, - identities::Vector{<:IdentityOp}, + identities::Vector{<:IdentityVal}, body_scalar_types::Vector{TypeId}) Encode a ScanOp (parallel prefix sum) operation. @@ -1354,7 +1354,7 @@ Encode a ScanOp (parallel prefix sum) operation. 
- operands: Input tiles to scan - dim: Dimension to scan along (0-indexed) - reverse: Whether to scan in reverse order -- identities: Identity values for each operand (reuses IdentityOp from IntegerReduce) +- identities: Identity values for each operand - body_scalar_types: 0D tile types for body arguments """ function encode_ScanOp!(body::Function, cb::CodeBuilder, @@ -1362,7 +1362,7 @@ function encode_ScanOp!(body::Function, cb::CodeBuilder, operands::Vector{Value}, dim::Int, reverse::Bool, - identities::Vector{<:IdentityOp}, + identities::Vector{<:IdentityVal}, body_scalar_types::Vector{TypeId}) encode_varint!(cb.buf, Opcode.ScanOp) From f8702d760c0bd7f2a354c1faf493ab08a0912693 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Jan 2026 15:43:49 +0100 Subject: [PATCH 3/6] Deduplicate. --- src/compiler/intrinsics/core.jl | 42 +++++++++------------------------ 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 0a560f4..d9a361f 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -569,7 +569,7 @@ function emit_reduce!(ctx::CGCtx, args, reduce_fn::Symbol) results = encode_ReduceOp!(cb, [output_tile_type], [input_tv.v], axis, [identity], [scalar_tile_type]) do block_args acc, elem = block_args[1], block_args[2] - res = encode_reduce_body(cb, scalar_tile_type, acc, elem, reduce_fn, elem_type) + res = encode_binop_body(cb, scalar_tile_type, acc, elem, reduce_fn, elem_type) encode_YieldOp!(cb, [res]) end @@ -609,26 +609,18 @@ operation_identity(::Val{:max}, dtype, ::Type{T}) where T <: Integer = IntegerIdentityVal(to_uint128(typemin(T)), dtype, T) #=============================================================================# -# Reduce Body Operations +# Binary Operation Body Encoding (shared by reduce and scan) #=============================================================================# -function encode_reduce_body(cb, type, acc, elem, 
op::Symbol, ::Type{T}) where T +function encode_binop_body(cb, type, acc, elem, op::Symbol, ::Type{T}) where T if T <: AbstractFloat - if op == :add - encode_AddFOp!(cb, type, acc, elem) - elseif op == :max - encode_MaxFOp!(cb, type, acc, elem) - else - error("Unsupported float reduction operation: $op") - end - else # Integer + op == :add ? encode_AddFOp!(cb, type, acc, elem) : + op == :max ? encode_MaxFOp!(cb, type, acc, elem) : + error("Unsupported float operation: $op") + else signedness = T <: Signed ? SignednessSigned : SignednessUnsigned - if op == :add - encode_AddIOp!(cb, type, acc, elem) - elseif op == :max - encode_MaxIOp!(cb, type, acc, elem; signedness) - else - error("Unsupported integer reduction operation: $op") - end + op == :add ? encode_AddIOp!(cb, type, acc, elem) : + op == :max ? encode_MaxIOp!(cb, type, acc, elem; signedness) : + error("Unsupported integer operation: $op") end end @@ -757,30 +749,18 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) scalar_tile_type = tile_type!(tt, dtype, Int[]) # Create identity value using operation_identity - # Reuses FloatIdentityOp and IntegerIdentityOp from IntegerReduce identity = operation_identity(Val(fn_type), dtype, elem_type) # Emit ScanOp results = encode_ScanOp!(cb, [output_tile_type], [input_tv.v], axis, reverse, [identity], [scalar_tile_type]) do block_args acc, elem = block_args[1], block_args[2] - res = encode_scan_body(cb, scalar_tile_type, acc, elem, Val(fn_type), elem_type) + res = encode_binop_body(cb, scalar_tile_type, acc, elem, fn_type, elem_type) encode_YieldOp!(cb, [res]) end - CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape) end -# Dispatch helpers for scan body operations - dispatch on Val{fn} and elem_type -encode_scan_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: AbstractFloat = - encode_AddFOp!(cb, type, acc, elem) -encode_scan_body(cb, type, acc, elem, ::Val{:add}, ::Type{T}) where T <: Integer = - 
encode_AddIOp!(cb, type, acc, elem) -encode_scan_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: AbstractFloat = - encode_MaxFOp!(cb, type, acc, elem) -encode_scan_body(cb, type, acc, elem, ::Val{:max}, ::Type{T}) where T <: Integer = - encode_MaxIOp!(cb, type, acc, elem; signedness=is_signed(T) ? SignednessSigned : SignednessUnsigned) - # cuda_tile.select @eval Intrinsics begin """ From 25d0f445d1f7840dc981f39bec6b95d5c7d7ecda Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Jan 2026 15:44:03 +0100 Subject: [PATCH 4/6] Shorten and add tests. --- test/codegen.jl | 146 +++++++++++++++------------------------------- test/execution.jl | 49 +++++++++++++++- 2 files changed, 95 insertions(+), 100 deletions(-) diff --git a/test/codegen.jl b/test/codegen.jl index 0bc96f6..58bb9aa 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -20,74 +20,58 @@ # TODO: offset - tile offset computation # TODO: pack - pack tiles @testset "scan" begin - # 1D cumulative sum (forward scan) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}, ct.TileArray{Float32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - result = ct.scan(tile, Val(1), :add, false) - ct.store(b, pid, result) - return - end - end - - # 2D cumulative sum along axis 1 (columns) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (4, 8)) - result = ct.scan(tile, Val(2), :add, false) - ct.store(b, pid, result) - return + # Forward scan - float and integer types + for (T, spec, op_check) in [ + (Float32, spec1d, "addf"), + (Int32, spec1d, "addi"), + ] + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{T,1,spec}}) do a + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + @check "scan" + @check op_check + Base.donotdelete(ct.scan(tile, Val(1), :add, false)) + return + end end end - 
# 2D cumulative sum along axis 2 (rows) - forward scan + # 2D scan along different axes @test @filecheck begin @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a pid = ct.bid(1) tile = ct.load(a, pid, (4, 8)) - result = ct.scan(tile, Val(1), :add, false) - ct.store(b, pid, result) + @check "scan" + Base.donotdelete(ct.scan(tile, Val(1), :add, false)) + @check "scan" + Base.donotdelete(ct.scan(tile, Val(2), :add, false)) return end end - # 2D cumulative sum along axis 2 (rows) - reverse scan + # Reverse scan @test @filecheck begin @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a pid = ct.bid(1) tile = ct.load(a, pid, (4, 8)) - result = ct.scan(tile, Val(1), :add, true) - ct.store(b, pid, result) + @check "scan" + Base.donotdelete(ct.scan(tile, Val(1), :add, true)) return end end - # Integer cumulative sum + # cumsum convenience @test @filecheck begin @check_label "entry" - code_tiled(Tuple{ct.TileArray{Int32,1,spec1d}, ct.TileArray{Int32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - result = ct.scan(tile, Val(1), :add, false) - ct.store(b, pid, result) - return - end - end - - # cumsum convenience function (forward scan) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}}) do a, b + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a pid = ct.bid(1) tile = ct.load(a, pid, (4, 8)) - result = ct.cumsum(tile, Val(2), false) - ct.store(b, pid, result) + @check "scan" + Base.donotdelete(ct.cumsum(tile, Val(2), false)) return end end @@ -457,61 +441,25 @@ return end end - end - - # Integer reduce_sum (Int32) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Int32,2,spec2d}, 
ct.TileArray{Int32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (4, 16)) - @check "reduce" - @check "addi" - sums = ct.reduce_sum(tile, 2) - ct.store(b, pid, sums) - return - end - end - # Integer reduce_max (Int32) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Int32,2,spec2d}, ct.TileArray{Int32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (4, 16)) - @check "reduce" - @check "maxi" - maxes = ct.reduce_max(tile, 2) - ct.store(b, pid, maxes) - return - end - end - - # Unsigned reduce_sum (UInt32) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{UInt32,2,spec2d}, ct.TileArray{UInt32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (4, 16)) - @check "reduce" - @check "addi" - sums = ct.reduce_sum(tile, 2) - ct.store(b, pid, sums) - return - end - end - - # Unsigned reduce_max (UInt32) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{UInt32,2,spec2d}, ct.TileArray{UInt32,1,spec1d}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (4, 16)) - @check "reduce" - @check "maxi" - maxes = ct.reduce_max(tile, 2) - ct.store(b, pid, maxes) - return + # Integer/unsigned reduce + for (T, op, op_check) in [ + (Int32, ct.reduce_sum, "addi"), + (Int32, ct.reduce_max, "maxi"), + (UInt32, ct.reduce_sum, "addi"), + (UInt32, ct.reduce_max, "maxi"), + ] + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{T,2,spec2d}}) do a + pid = ct.bid(1) + tile = ct.load(a, pid, (4, 16)) + @check "reduce" + @check op_check + Base.donotdelete(op(tile, 2)) + return + end + end end end diff --git a/test/execution.jl b/test/execution.jl index 6c4e493..bf0585f 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -1834,7 +1834,9 @@ end # UInt16: 1 to 2000 (32 * 2000 = 64,000, safely within UInt16 range 0 to 65,535) # Larger types: -1000 to 1000 (arbitrary but covers positive/negative) # Floats: 0 to 1 (CUDA.rand default) - if 
elType == Int8 + if elType == UInt8 + a_gpu = CuArray{UInt8}(rand(UInt8(0):UInt8(7), N)) + elseif elType == Int8 a_gpu = CuArray{Int8}(rand(-3:3, N)) elseif elType == Int16 a_gpu = CuArray{Int16}(rand(-800:800, N)) @@ -1870,6 +1872,51 @@ end end end +# Kernel factory for scan operations +function makeScanKernel(::Type{T}) where {T} + @inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) + ct.store(b, ct.bid(1), ct.cumsum(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + return nothing + end + return kernel +end + +@testset "1D scan (cumsum)" begin + TILE_SIZE = 32 + N = 1024 + + TEST_TYPES = [Float16, Float32, Float64, Int32, Int64, UInt32, UInt64] + + @testset "Type: $elType" for elType in TEST_TYPES + scanKernel = makeScanKernel(elType) + + # Type-appropriate input generation (small values to avoid overflow in cumsum) + if elType <: Integer && elType <: Signed + a_gpu = CuArray{elType}(rand(elType(-3):elType(3), N)) + elseif elType <: Integer + a_gpu = CuArray{elType}(rand(elType(0):elType(7), N)) + else + a_gpu = CUDA.rand(elType, N) + end + b_gpu = CUDA.zeros(elType, N) + + CUDA.@sync ct.launch(scanKernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) + + a_cpu = Array(a_gpu) + b_cpu = Array(b_gpu) + + # CPU reference: per-tile cumulative sum + a_reshaped = reshape(a_cpu, TILE_SIZE, :) + expected = mapslices(x -> accumulate(+, x), a_reshaped, dims=1) + + if elType <: AbstractFloat + @test b_cpu ≈ vec(expected) rtol=1e-3 + else + @test b_cpu == vec(expected) + end + end +end + @testset "transpose with hints" begin function transpose_with_hints(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) From 5bffaa825030bb8fa8db82a7a3e3616feb954c07 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Jan 2026 15:45:20 +0100 Subject: [PATCH 5/6] Simplify. 
--- src/compiler/intrinsics/conversions.jl | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl index 5e0d5ae..8986b72 100644 --- a/src/compiler/intrinsics/conversions.jl +++ b/src/compiler/intrinsics/conversions.jl @@ -40,22 +40,17 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.astype), args) target_dtype = julia_to_tile_dtype!(tt, target_elem) target_tile_type = tile_type!(tt, target_dtype, tile_shape) - # Determine signedness for integer types - function is_signed_int(T) - T <: Signed || T === Int32 || T === Int64 || T === Int16 || T === Int8 - end - # Emit conversion based on source and target types result = if source_elem <: AbstractFloat && target_elem <: AbstractFloat # Float -> Float encode_FToFOp!(cb, target_tile_type, source.v) elseif source_elem <: Integer && target_elem <: AbstractFloat # Integer -> Float - signedness = is_signed_int(source_elem) ? SignednessSigned : SignednessUnsigned + signedness = source_elem <: Signed ? SignednessSigned : SignednessUnsigned encode_IToFOp!(cb, target_tile_type, source.v; signedness) elseif source_elem <: AbstractFloat && target_elem <: Integer # Float -> Integer - signedness = is_signed_int(target_elem) ? SignednessSigned : SignednessUnsigned + signedness = target_elem <: Signed ? SignednessSigned : SignednessUnsigned encode_FToIOp!(cb, target_tile_type, source.v; signedness) elseif source_elem <: Integer && target_elem <: Integer # Integer -> Integer @@ -66,7 +61,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.astype), args) source.v elseif target_size > source_size # Extension (upsize) - signedness = is_signed_int(source_elem) ? SignednessSigned : SignednessUnsigned + signedness = source_elem <: Signed ? 
SignednessSigned : SignednessUnsigned encode_ExtIOp!(cb, target_tile_type, source.v; signedness) else # Truncation (downsize) From 1d6ec341329e3de11f0e98c65218c339592c6148 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 30 Jan 2026 15:53:53 +0100 Subject: [PATCH 6/6] Move scan example into tests. --- examples/scankernel.jl | 62 ----------- test/execution.jl | 228 +++++++++++++++++++++++------------------ 2 files changed, 131 insertions(+), 159 deletions(-) delete mode 100644 examples/scankernel.jl diff --git a/examples/scankernel.jl b/examples/scankernel.jl deleted file mode 100644 index b0764df..0000000 --- a/examples/scankernel.jl +++ /dev/null @@ -1,62 +0,0 @@ -using Test -using CUDA -using cuTile -import cuTile as ct - -function cumsum_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - tile_size::ct.Constant{Int}) - bid = ct.bid(1) - tile = ct.load(a, bid, (tile_size[],)) - result = ct.cumsum(tile, Val(1)) # Val(1) means 1st (0th) dimension for 1D tile - ct.store(b, bid, result) - return nothing -end - -sz = 32 -N = 2^15 -a = CUDA.rand(Float32, N) -b = CUDA.zeros(Float32, N) -CUDA.@sync ct.launch(cumsum_1d_kernel, cld(length(a), sz), a, b, ct.Constant(sz)) - -# This is supposed to be a single pass kernel but its simpler version than memory ordering version. -# The idea is to show how device scan operation can be done. 
- -# CSDL phase 1: Intra-tile scan + store tile sums -function cumsum_csdl_phase1(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - tile_sums::ct.TileArray{Float32,1}, - tile_size::ct.Constant{Int}) - bid = ct.bid(1) - tile = ct.load(a, bid, (tile_size[],)) - result = ct.cumsum(tile, Val(1)) - ct.store(b, bid, result) - tile_sum = ct.extract(result, (tile_size[],), (1,)) # Extract last element (1 element shape) - ct.store(tile_sums, bid, tile_sum) - return -end - -# CSDL phase 2: Decoupled lookback to accumulate previous tile sums -function cumsum_csdl_phase2(b::ct.TileArray{Float32,1}, - tile_sums::ct.TileArray{Float32,1}, - tile_size::ct.Constant{Int}) - bid = ct.bid(1) - prev_sum = ct.zeros((tile_size[],), Float32) - k = Int32(bid) - while k > 1 - tile_sum_k = ct.load(tile_sums, (k,), (1,)) - prev_sum = prev_sum .+ tile_sum_k - k -= Int32(1) - end - tile = ct.load(b, bid, (tile_size[],)) - result = tile .+ prev_sum - ct.store(b, bid, result) - return nothing -end - -n = length(a) -num_tiles = cld(n, sz) -tile_sums = CUDA.zeros(Float32, num_tiles) -CUDA.@sync ct.launch(cumsum_csdl_phase1, num_tiles, a, b, tile_sums, ct.Constant(sz)) -CUDA.@sync ct.launch(cumsum_csdl_phase2, num_tiles, b, tile_sums, ct.Constant(sz)) - -b_cpu = cumsum(a |> collect, dims=1) -@test isapprox(b |> collect, b_cpu) # This might fail occasionally diff --git a/test/execution.jl b/test/execution.jl index bf0585f..f12603f 100644 --- a/test/execution.jl +++ b/test/execution.jl @@ -997,6 +997,85 @@ end end +@testset "scan" begin + +@testset "1D cumsum (forward)" begin + function cumsum_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(a, bid, (tile_size[],)) + result = ct.cumsum(tile, Val(1)) + ct.store(b, bid, result) + return nothing + end + + sz = 32 + N = 1024 + a = CUDA.rand(Float32, N) + b = CUDA.zeros(Float32, N) + + ct.launch(cumsum_1d_kernel, cld(N, sz), a, b, ct.Constant(sz)) + + # Per-tile 
cumulative sum + a_cpu = Array(a) + b_cpu = Array(b) + a_reshaped = reshape(a_cpu, sz, :) + expected = mapslices(x -> accumulate(+, x), a_reshaped, dims=1) + @test b_cpu ≈ vec(expected) rtol=1e-3 +end + +@testset "2D cumsum along axis 1" begin + function cumsum_2d_axis1_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (4, 8)) + result = ct.cumsum(tile, Val(1)) + ct.store(b, (pid, 1), result) + return nothing + end + + m, n = 32, 8 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m, n) + + ct.launch(cumsum_2d_axis1_kernel, cld(m, 4), a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + # cumsum along dim 1 within each 4-row tile + for bid in 0:(cld(m, 4)-1) + rows = (bid*4+1):(bid*4+4) + for j in 1:n + @test b_cpu[rows, j] ≈ accumulate(+, a_cpu[rows, j]) rtol=1e-3 + end + end +end + +@testset "1D reverse cumsum" begin + function reverse_cumsum_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(a, bid, (tile_size[],)) + result = ct.scan(tile, Val(1), :add, true) + ct.store(b, bid, result) + return nothing + end + + sz = 32 + N = 1024 + a = CUDA.rand(Float32, N) + b = CUDA.zeros(Float32, N) + + ct.launch(reverse_cumsum_kernel, cld(N, sz), a, b, ct.Constant(sz)) + + a_cpu = Array(a) + b_cpu = Array(b) + a_reshaped = reshape(a_cpu, sz, :) + expected = mapslices(x -> reverse(accumulate(+, reverse(x))), a_reshaped, dims=1) + @test b_cpu ≈ vec(expected) rtol=1e-3 +end + +end + @testset "scalar-tile operations" begin for (name, kernel_expr, cpu_expr) in [ @@ -1536,7 +1615,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1557,7 +1636,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_kernel_occupancy, 64, a, b, c; occupancy=4) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1578,7 +1657,7 @@ end c = 
CUDA.zeros(Float32, n) ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1603,7 +1682,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_with_load_latency, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1624,7 +1703,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_no_tma, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1645,7 +1724,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_both_load_hints, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1663,7 +1742,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(copy_with_store_latency, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end @@ -1681,7 +1760,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(copy_no_tma_store, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end @@ -1704,7 +1783,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_mixed_hints, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1731,7 +1810,7 @@ end grid_x = cld(M, 32) grid_y = cld(N, 32) ct.launch(matmul_with_hints, (grid_x, grid_y, 1), a, b, c) - CUDA.synchronize() + # Verify against CPU reference a_cpu = Array(a) @@ -1759,7 +1838,7 @@ end b = CUDA.zeros(Float32, m) ct.launch(reduce_with_hints, m, a, b) - CUDA.synchronize() + # Each row should be summed a_cpu = Array(a) @@ -1769,71 +1848,40 @@ end end end -# Kernel factory for reduce operations - extendable pattern -function makeReduceKernel(::Type{T}, op::Symbol) where {T} - reduceFunc = if op == :reduce_sum - ct.reduce_sum - elseif op == :reduce_max - ct.reduce_max - # ADD NEW OPERATIONS HERE - # elseif op == :reduce_min - # ct.reduce_min - # elseif op == :reduce_mul - # ct.reduce_mul +@testset "1D reduce operations" begin + TILE_SIZE = 32 + N = 1024 + + function reduce_sum_1d(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, + tileSz::ct.Constant{Int}) 
where {T} + ct.store(b, ct.bid(1), ct.reduce_sum(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + return nothing end - @inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) - ct.store(b, ct.bid(1), reduceFunc(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + function reduce_max_1d(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, + tileSz::ct.Constant{Int}) where {T} + ct.store(b, ct.bid(1), ct.reduce_max(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) return nothing end - return kernel -end -# CPU reference implementation for reduce operations - extendable pattern -function cpu_reduce(a_reshaped::AbstractArray{T}, op::Symbol) where {T} - if op == :reduce_sum - result = sum(a_reshaped, dims=1)[:] - # For unsigned types, apply mask to handle overflow - if T <: Unsigned + function cpu_reduce(a_reshaped::AbstractArray{T}, op) where {T} + result = mapslices(op, a_reshaped, dims=1)[:] + # For unsigned sum, apply mask to handle overflow + if T <: Unsigned && op === sum result .= result .& typemax(T) end return result - elseif op == :reduce_max - return maximum(a_reshaped, dims=1)[:] - # ADD NEW OPERATIONS HERE - # elseif op == :reduce_min - # return minimum(a_reshaped, dims=1)[:] - # elseif op == :reduce_mul - # return prod(a_reshaped, dims=1)[:] end -end -@testset "1D reduce operations (extendable)" begin - # Test parameters - easily extendable - TILE_SIZE = 32 - N = 1024 - - # Supported types - add new types here TEST_TYPES = [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64] - - # Supported operations - add new operations here - TEST_OPS = [:reduce_sum, :reduce_max] - - @testset "Type: $elType, Operation: $op" for elType in TEST_TYPES, op in TEST_OPS - # Create kernel using factory - reduceKernel = try - makeReduceKernel(elType, op) - catch e - @test_broken false - rethrow() - end - - # Generate input data with type-appropriate ranges - # Int8: -3 to 3 (32 * 3 = 96, safely within Int8 range -128 to 127) - # 
Int16: -800 to 800 (32 * 800 = 25,600, safely within Int16 range -32,768 to 32,767) - # UInt16: 1 to 2000 (32 * 2000 = 64,000, safely within UInt16 range 0 to 65,535) - # Larger types: -1000 to 1000 (arbitrary but covers positive/negative) - # Floats: 0 to 1 (CUDA.rand default) + + TEST_OPS = [ + (reduce_sum_1d, sum), + (reduce_max_1d, maximum), + ] + + @testset "Type: $elType, Operation: $gpu_kernel" for elType in TEST_TYPES, (gpu_kernel, cpu_op) in TEST_OPS + # Generate input data with type-appropriate ranges to avoid overflow if elType == UInt8 a_gpu = CuArray{UInt8}(rand(UInt8(0):UInt8(7), N)) elseif elType == Int8 @@ -1848,22 +1896,14 @@ end a_gpu = CUDA.rand(elType, N) end b_gpu = CUDA.zeros(elType, cld(N, TILE_SIZE)) - - # Launch kernel - try - CUDA.@sync ct.launch(reduceKernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) - catch e - @test_broken false - rethrow() - end - - # Verify results + + ct.launch(gpu_kernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) + a_cpu = Array(a_gpu) b_cpu = Array(b_gpu) a_reshaped = reshape(a_cpu, TILE_SIZE, :) - cpu_result = cpu_reduce(a_reshaped, op) - - # Use appropriate comparison based on type + cpu_result = cpu_reduce(a_reshaped, cpu_op) + if elType <: AbstractFloat @test b_cpu ≈ cpu_result rtol=1e-3 else @@ -1872,24 +1912,18 @@ end end end -# Kernel factory for scan operations -function makeScanKernel(::Type{T}) where {T} - @inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) - ct.store(b, ct.bid(1), ct.cumsum(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) - return nothing - end - return kernel -end - @testset "1D scan (cumsum)" begin TILE_SIZE = 32 N = 1024 + function scan_kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) where {T} + ct.store(b, ct.bid(1), ct.cumsum(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + return nothing + end + TEST_TYPES = [Float16, Float32, Float64, Int32, Int64, UInt32, UInt64] @testset "Type: 
$elType" for elType in TEST_TYPES - scanKernel = makeScanKernel(elType) - # Type-appropriate input generation (small values to avoid overflow in cumsum) if elType <: Integer && elType <: Signed a_gpu = CuArray{elType}(rand(elType(-3):elType(3), N)) @@ -1900,7 +1934,7 @@ end end b_gpu = CUDA.zeros(elType, N) - CUDA.@sync ct.launch(scanKernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) + ct.launch(scan_kernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) a_cpu = Array(a_gpu) b_cpu = Array(b_gpu) @@ -1936,7 +1970,7 @@ end y = CUDA.zeros(Float32, n, m) ct.launch(transpose_with_hints, (cld(m, tile_size), cld(n, tile_size)), x, y) - CUDA.synchronize() + @test Array(y) ≈ transpose(Array(x)) end @@ -1967,7 +2001,7 @@ end d = CUDA.zeros(Float32, n) ct.launch(complex_hints_kernel, 64, a, b, c, d) - CUDA.synchronize() + @test Array(d) ≈ ones(Float32, n) .* 6 end @@ -1988,7 +2022,7 @@ end c = CUDA.zeros(Float64, n) ct.launch(vadd_f64_hints, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ Array(a) + Array(b) end @@ -2009,7 +2043,7 @@ end c = CUDA.zeros(Float16, n) ct.launch(vadd_f16_hints, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ Array(a) + Array(b) end @@ -2028,7 +2062,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(test_boundary_latency, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end @@ -2049,7 +2083,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(gather_with_latency, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end @@ -2069,7 +2103,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(scatter_with_latency, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end