#=============================================================================
 Scan operations
=============================================================================#

"""
    encode_ScanOp!(body::Function, cb::CodeBuilder,
                   result_types::Vector{TypeId},
                   operands::Vector{Value},
                   dim::Int,
                   reverse::Bool,
                   identities::Vector{<:IdentityVal},
                   body_scalar_types::Vector{TypeId})

Encode a ScanOp: a scan of `operands` along `dim` whose combining step is the
single region produced by `body`.

# Arguments
- `body`: function that receives the region's block arguments and is expected
  to yield the combined value(s)
- `cb`: CodeBuilder that receives the bytecode
- `result_types`: output tile types
- `operands`: input tiles to scan
- `dim`: dimension to scan along (0-indexed)
- `reverse`: whether to scan in reverse order
- `identities`: identity values, one per operand
- `body_scalar_types`: 0D tile types of the body arguments, one per operand

# Returns
A `Vector{Value}` with one fresh value per entry of `result_types`;
`cb.next_value_id` is advanced by that count.

# Throws
- `ArgumentError` if `dim` is negative, or if `identities` or
  `body_scalar_types` do not have exactly one entry per operand. All checks run
  before anything is written to `cb`, so a rejected call leaves it untouched.
"""
function encode_ScanOp!(body::Function, cb::CodeBuilder,
                        result_types::Vector{TypeId},
                        operands::Vector{Value},
                        dim::Int,
                        reverse::Bool,
                        identities::Vector{<:IdentityVal},
                        body_scalar_types::Vector{TypeId})
    # Validate up front: a mismatch would otherwise be encoded silently and
    # only surface later as malformed bytecode.
    num_operands = length(operands)
    dim >= 0 ||
        throw(ArgumentError("ScanOp dim is 0-indexed and must be non-negative, got $dim"))
    length(identities) == num_operands ||
        throw(ArgumentError("ScanOp needs one identity per operand, got " *
                            "$(length(identities)) identities for $num_operands operands"))
    length(body_scalar_types) == num_operands ||
        throw(ArgumentError("ScanOp needs one body scalar type per operand, got " *
                            "$(length(body_scalar_types)) types for $num_operands operands"))

    encode_varint!(cb.buf, Opcode.ScanOp)

    # Variadic result types
    encode_typeid_seq!(cb.buf, result_types)

    # Attributes: dim (int), reverse (bool), identities (array)
    encode_opattr_int!(cb, dim)
    encode_opattr_bool!(cb, reverse)
    encode_identity_array!(cb, identities)

    # Variadic operands
    encode_varint!(cb.buf, num_operands)
    encode_operands!(cb.buf, operands)

    # Per-op bookkeeping: record the debug attribute and count the op
    push!(cb.debug_attrs, cb.cur_debug_attr)
    cb.num_ops += 1

    # Number of regions
    encode_varint!(cb.buf, 1)  # 1 region: body

    # Body region - block args are pairs of (acc, elem) for each operand.
    # The body operates on 0D tiles (scalars).
    body_arg_types = TypeId[]
    sizehint!(body_arg_types, 2 * length(body_scalar_types))
    for scalar_type in body_scalar_types
        push!(body_arg_types, scalar_type, scalar_type)  # accumulator, element
    end
    with_region(body, cb, body_arg_types)

    # Create result values only after the region has been emitted, so their ids
    # follow whatever ids the body consumed. The typed comprehension keeps the
    # element type `Value` even when there are no results.
    num_results = length(result_types)
    first_id = cb.next_value_id
    cb.next_value_id += num_results
    return Value[Value(first_id + i) for i in 0:num_results-1]
end
#=============================================================================#
# Binary Operation Body Encoding (shared by reduce and scan)
#=============================================================================#

"""
    encode_binop_body(cb, type, acc, elem, op::Symbol, ::Type{T}) where T

Emit the binary operation `op` applied to `acc` and `elem` and return whatever
the selected encoder returns.

`T` picks the encoder family: subtypes of `AbstractFloat` use the float
encoders, every other `T` takes the integer path. Supported `op` values are
`:add` and `:max`; anything else raises an error. Only the integer `:max`
encoder is given a signedness, derived from `T <: Signed`.
"""
function encode_binop_body(cb, type, acc, elem, op::Symbol, ::Type{T}) where T
    if T <: AbstractFloat
        if op == :add
            return encode_AddFOp!(cb, type, acc, elem)
        elseif op == :max
            return encode_MaxFOp!(cb, type, acc, elem)
        end
        error("Unsupported float operation: $op")
    end

    # Non-float element types fall through to the integer encoders.
    if op == :add
        return encode_AddIOp!(cb, type, acc, elem)
    elseif op == :max
        signedness = T <: Signed ? SignednessSigned : SignednessUnsigned
        return encode_MaxIOp!(cb, type, acc, elem; signedness)
    end
    error("Unsupported integer operation: $op")
end
# ---- src/compiler/intrinsics/core.jl ----------------------------------------

# cuda_tile.scan
@eval Intrinsics begin
    """
        scan(tile, ::Val{axis}, fn::Symbol, reverse::Bool=false)

    Parallel prefix scan of `tile` along `axis`.

    `fn` selects the combining operation; only `:add` (cumulative sum) is
    accepted by the code generator. `reverse` is a positional argument:
    `false` for a forward scan, `true` for a reverse scan.
    Compiled to cuda_tile.scan.
    """
    @noinline function scan(tile::Tile{T, S}, ::Val{axis}, fn::Symbol, reverse::Bool=false) where {T, S, axis}
        # Scan preserves shape - result has same dimensions as input
        Tile{T, S}()
    end
end

"""
    emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args)

Lower a call to `Intrinsics.scan` to a ScanOp and return the `CGVal` of the
result tile, which has the same shape and element type as the input.

`args` is `(tile, axis, fn[, reverse])`. `axis`, `fn` and, when present,
`reverse` must all resolve to compile-time constants. `axis` is passed straight
to `encode_ScanOp!` as its `dim` (the public `scan` wrapper has already
subtracted 1 from the user's axis). Errors are raised when the input tile
cannot be resolved, when a constant cannot be resolved, when `axis` is outside
the tile's rank, or when `fn` is anything other than `:add`.
"""
function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args)
    cb = ctx.cb
    tt = ctx.tt

    # Get input tile
    input_tv = emit_value!(ctx, args[1])
    input_tv === nothing && error("Cannot resolve input tile for scan")

    # Get scan axis
    axis = @something get_constant(ctx, args[2]) error("Scan axis must be a compile-time constant")

    # Get scan function type (only :add is supported)
    fn_type = @something get_constant(ctx, args[3]) error("Scan function type must be a compile-time constant")
    fn_type == :add || error("Only :add (cumulative sum) is currently supported for scan operations")

    # Get reverse flag (optional, defaults to false). The direction is baked
    # into the op as an attribute, so an unresolved or non-Bool value must be an
    # error: quietly falling back to `false` would emit a forward scan and
    # return wrong results for a caller that asked for a reverse one.
    reverse = false
    if length(args) >= 4
        reverse_val = @something get_constant(ctx, args[4]) error("Scan reverse flag must be a compile-time constant")
        reverse_val isa Bool || error("Scan reverse flag must be a Bool, got $(typeof(reverse_val))")
        reverse = reverse_val
    end

    # Get element type and shapes
    input_type = unwrap_type(input_tv.jltype)
    elem_type = input_type <: Tile ? input_type.parameters[1] : input_type
    input_shape = input_tv.shape

    # Reject an axis that does not name a dimension of the input
    rank = length(input_shape)
    (axis isa Integer && 0 <= axis < rank) ||
        error("Scan axis $axis is out of range for a rank-$rank tile (expected 0 to $(rank - 1))")

    # For scan, output shape is same as input shape
    output_shape = copy(input_shape)

    dtype = julia_to_tile_dtype!(tt, elem_type)

    # Output tile type (same shape as input)
    output_tile_type = tile_type!(tt, dtype, output_shape)

    # Scalar type for scan body (0D tile)
    scalar_tile_type = tile_type!(tt, dtype, Int[])

    # Create identity value using operation_identity
    identity = operation_identity(Val(fn_type), dtype, elem_type)

    # Emit ScanOp; the body combines the (acc, elem) pair and yields the result
    results = encode_ScanOp!(cb, [output_tile_type], [input_tv.v], axis, reverse, [identity], [scalar_tile_type]) do block_args
        acc, elem = block_args[1], block_args[2]
        res = encode_binop_body(cb, scalar_tile_type, acc, elem, fn_type, elem_type)
        encode_YieldOp!(cb, [res])
    end

    CGVal(results[1], output_tile_type, Tile{elem_type, Tuple(output_shape)}, output_shape)
end

# ---- src/language/operations.jl ---------------------------------------------

# Scan (Prefix Sum) Operations

"""
    scan(tile, ::Val{axis}, fn::Symbol=:add, reverse::Bool=false)

Scan `tile` along `axis` with the operation named by `fn`. `axis` is shifted
down by one before being handed to `Intrinsics.scan`. `reverse=true` requests
a reverse scan.
"""
@inline function scan(tile::Tile{T, S}, ::Val{axis},
                      fn::Symbol=:add,
                      reverse::Bool=false) where {T<:Number, S, axis}
    Intrinsics.scan(tile, Val(axis - 1), fn, reverse)
end

"""
    cumsum(tile, ::Val{axis}, reverse::Bool=false)

Cumulative sum of `tile` along `axis`; shorthand for
`scan(tile, Val(axis), :add, reverse)`.
"""
@inline function cumsum(tile::Tile{T, S}, ::Val{axis},
                        reverse::Bool=false) where {T<:Number, S, axis}
    scan(tile, Val(axis), :add, reverse)
end
# Codegen checks for ct.scan / ct.cumsum. Each case compiles a small kernel and
# uses FileCheck-style directives to look for text in the generated code. The
# @check directives are interleaved with the statements they refer to, so
# their order inside each block is significant.
@testset "scan" begin
    # Forward scan - float and integer types
    # op_check is the add instruction expected in the scan body for each
    # element type ("addf" for Float32, "addi" for Int32).
    for (T, spec, op_check) in [
        (Float32, spec1d, "addf"),
        (Int32, spec1d, "addi"),
    ]
        @test @filecheck begin
            @check_label "entry"
            code_tiled(Tuple{ct.TileArray{T,1,spec}}) do a
                pid = ct.bid(1)
                tile = ct.load(a, pid, (16,))
                @check "scan"
                @check op_check
                # donotdelete keeps the otherwise-unused scan result alive
                Base.donotdelete(ct.scan(tile, Val(1), :add, false))
                return
            end
        end
    end

    # 2D scan along different axes
    # Two scans on the same tile, one per axis; each must produce its own "scan".
    @test @filecheck begin
        @check_label "entry"
        code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a
            pid = ct.bid(1)
            tile = ct.load(a, pid, (4, 8))
            @check "scan"
            Base.donotdelete(ct.scan(tile, Val(1), :add, false))
            @check "scan"
            Base.donotdelete(ct.scan(tile, Val(2), :add, false))
            return
        end
    end

    # Reverse scan
    # NOTE(review): this only checks that a scan is emitted; nothing here
    # matches the reverse flag itself, so a dropped flag would still pass.
    @test @filecheck begin
        @check_label "entry"
        code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a
            pid = ct.bid(1)
            tile = ct.load(a, pid, (4, 8))
            @check "scan"
            Base.donotdelete(ct.scan(tile, Val(1), :add, true))
            return
        end
    end

    # cumsum convenience
    # ct.cumsum is expected to lower to the same scan op as ct.scan with :add.
    @test @filecheck begin
        @check_label "entry"
        code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a
            pid = ct.bid(1)
            tile = ct.load(a, pid, (4, 8))
            @check "scan"
            Base.donotdelete(ct.cumsum(tile, Val(2), false))
            return
        end
    end
end
# End-to-end scan tests: run the kernels on the device and compare each tile's
# output with a CPU `accumulate(+, ...)` over the same tile.
@testset "scan" begin

@testset "1D cumsum (forward)" begin
    function cumsum_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
                              tile_size::ct.Constant{Int})
        block = ct.bid(1)
        chunk = ct.load(a, block, (tile_size[],))
        prefix = ct.cumsum(chunk, Val(1))
        ct.store(b, block, prefix)
        return nothing
    end

    tile_len = 32
    total = 1024
    src = CUDA.rand(Float32, total)
    dst = CUDA.zeros(Float32, total)

    ct.launch(cumsum_1d_kernel, cld(total, tile_len), src, dst, ct.Constant(tile_len))

    # Reference: one column per tile, so the running sum restarts at every
    # tile boundary.
    src_host = Array(src)
    dst_host = Array(dst)
    tiles = reshape(src_host, tile_len, :)
    expected = reduce(vcat, [accumulate(+, col) for col in eachcol(tiles)])
    @test dst_host ≈ expected rtol=1e-3
end

@testset "2D cumsum along axis 1" begin
    function cumsum_2d_axis1_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2})
        block = ct.bid(1)
        chunk = ct.load(a, (block, 1), (4, 8))
        prefix = ct.cumsum(chunk, Val(1))
        ct.store(b, (block, 1), prefix)
        return nothing
    end

    m, n = 32, 8
    src = CUDA.rand(Float32, m, n)
    dst = CUDA.zeros(Float32, m, n)

    ct.launch(cumsum_2d_axis1_kernel, cld(m, 4), src, dst)

    src_host = Array(src)
    dst_host = Array(dst)
    # Each block covers 4 consecutive rows; within those rows every column
    # carries its own running sum.
    for first_row in 1:4:m
        rows = first_row:(first_row + 3)
        for j in 1:n
            @test dst_host[rows, j] ≈ accumulate(+, src_host[rows, j]) rtol=1e-3
        end
    end
end

@testset "1D reverse cumsum" begin
    function reverse_cumsum_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
                                   tile_size::ct.Constant{Int})
        block = ct.bid(1)
        chunk = ct.load(a, block, (tile_size[],))
        suffix = ct.scan(chunk, Val(1), :add, true)
        ct.store(b, block, suffix)
        return nothing
    end

    tile_len = 32
    total = 1024
    src = CUDA.rand(Float32, total)
    dst = CUDA.zeros(Float32, total)

    ct.launch(reverse_cumsum_kernel, cld(total, tile_len), src, dst, ct.Constant(tile_len))

    # Reference: accumulate from the far end of each tile, then flip back.
    src_host = Array(src)
    dst_host = Array(dst)
    tiles = reshape(src_host, tile_len, :)
    expected = reduce(vcat, [reverse(accumulate(+, reverse(col))) for col in eachcol(tiles)])
    @test dst_host ≈ expected rtol=1e-3
end

end
@test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1603,7 +1682,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_with_load_latency, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1624,7 +1703,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_no_tma, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1645,7 +1724,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_both_load_hints, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1663,7 +1742,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(copy_with_store_latency, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end @@ -1681,7 +1760,7 @@ end b = CUDA.zeros(Float32, n) ct.launch(copy_no_tma_store, 64, a, b) - CUDA.synchronize() + @test Array(b) ≈ Array(a) end @@ -1704,7 +1783,7 @@ end c = CUDA.zeros(Float32, n) ct.launch(vadd_mixed_hints, 64, a, b, c) - CUDA.synchronize() + @test Array(c) ≈ ones(Float32, n) .* 3 end @@ -1731,7 +1810,7 @@ end grid_x = cld(M, 32) grid_y = cld(N, 32) ct.launch(matmul_with_hints, (grid_x, grid_y, 1), a, b, c) - CUDA.synchronize() + # Verify against CPU reference a_cpu = Array(a) @@ -1759,7 +1838,7 @@ end b = CUDA.zeros(Float32, m) ct.launch(reduce_with_hints, m, a, b) - CUDA.synchronize() + # Each row should be summed a_cpu = Array(a) @@ -1769,72 +1848,43 @@ end end end -# Kernel factory for reduce operations - extendable pattern -function makeReduceKernel(::Type{T}, op::Symbol) where {T} - reduceFunc = if op == :reduce_sum - ct.reduce_sum - elseif op == :reduce_max - ct.reduce_max - # ADD NEW OPERATIONS HERE - # elseif op == :reduce_min - # ct.reduce_min - # elseif op == :reduce_mul - # ct.reduce_mul +@testset "1D reduce operations" begin + TILE_SIZE = 32 + N = 1024 + + function reduce_sum_1d(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, + tileSz::ct.Constant{Int}) where {T} + ct.store(b, ct.bid(1), ct.reduce_sum(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + return nothing end - 
@inline function kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) - ct.store(b, ct.bid(1), reduceFunc(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) + function reduce_max_1d(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, + tileSz::ct.Constant{Int}) where {T} + ct.store(b, ct.bid(1), ct.reduce_max(ct.load(a, ct.bid(1), (tileSz[],)), Val(1))) return nothing end - return kernel -end -# CPU reference implementation for reduce operations - extendable pattern -function cpu_reduce(a_reshaped::AbstractArray{T}, op::Symbol) where {T} - if op == :reduce_sum - result = sum(a_reshaped, dims=1)[:] - # For unsigned types, apply mask to handle overflow - if T <: Unsigned + function cpu_reduce(a_reshaped::AbstractArray{T}, op) where {T} + result = mapslices(op, a_reshaped, dims=1)[:] + # For unsigned sum, apply mask to handle overflow + if T <: Unsigned && op === sum result .= result .& typemax(T) end return result - elseif op == :reduce_max - return maximum(a_reshaped, dims=1)[:] - # ADD NEW OPERATIONS HERE - # elseif op == :reduce_min - # return minimum(a_reshaped, dims=1)[:] - # elseif op == :reduce_mul - # return prod(a_reshaped, dims=1)[:] end -end -@testset "1D reduce operations (extendable)" begin - # Test parameters - easily extendable - TILE_SIZE = 32 - N = 1024 - - # Supported types - add new types here TEST_TYPES = [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64] - - # Supported operations - add new operations here - TEST_OPS = [:reduce_sum, :reduce_max] - - @testset "Type: $elType, Operation: $op" for elType in TEST_TYPES, op in TEST_OPS - # Create kernel using factory - reduceKernel = try - makeReduceKernel(elType, op) - catch e - @test_broken false - rethrow() - end - - # Generate input data with type-appropriate ranges - # Int8: -3 to 3 (32 * 3 = 96, safely within Int8 range -128 to 127) - # Int16: -800 to 800 (32 * 800 = 25,600, safely within Int16 range -32,768 to 32,767) - # UInt16: 1 to 2000 (32 * 2000 
# Per-tile cumulative sum over several element types, checked against a CPU
# `accumulate(+, ...)` reference.
@testset "1D scan (cumsum)" begin
    TILE_SIZE = 32
    N = 1024

    function scan_kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) where {T}
        ct.store(b, ct.bid(1), ct.cumsum(ct.load(a, ct.bid(1), (tileSz[],)), Val(1)))
        return nothing
    end

    TEST_TYPES = [Float16, Float32, Float64, Int32, Int64, UInt32, UInt64]

    @testset "Type: $elType" for elType in TEST_TYPES
        # Integer inputs stay small so a 32-element running sum cannot overflow:
        # signed types draw from -3:3, unsigned from 0:7. Floats use CUDA.rand.
        a_gpu = if elType <: Signed
            CuArray{elType}(rand(elType(-3):elType(3), N))
        elseif elType <: Integer
            CuArray{elType}(rand(elType(0):elType(7), N))
        else
            CUDA.rand(elType, N)
        end
        b_gpu = CUDA.zeros(elType, N)

        ct.launch(scan_kernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE))

        host_in = Array(a_gpu)
        host_out = Array(b_gpu)

        # CPU reference: one column per tile, cumulative sum down each column
        per_tile = reshape(host_in, TILE_SIZE, :)
        reference = vec(mapslices(col -> accumulate(+, col), per_tile, dims=1))

        # Integer results must match exactly; floats only up to rounding
        if elType <: Integer
            @test host_out == reference
        else
            @test host_out ≈ reference rtol=1e-3
        end
    end
end