From 1180d2d30a53590cc3306493c1256563e93ed003 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 9 Feb 2026 10:14:57 +0100 Subject: [PATCH 1/2] Split tests for better parallelism. --- test/codegen/integration.jl | 1075 +++++++ test/{codegen.jl => codegen/operations.jl} | 1075 ------- test/execution.jl | 3257 -------------------- test/execution/advanced.jl | 236 ++ test/execution/atomics.jl | 213 ++ test/execution/basic.jl | 1060 +++++++ test/execution/broadcast.jl | 765 +++++ test/execution/hints.jl | 236 ++ test/execution/reductions.jl | 569 ++++ test/runtests.jl | 2 +- 10 files changed, 4155 insertions(+), 4333 deletions(-) create mode 100644 test/codegen/integration.jl rename test/{codegen.jl => codegen/operations.jl} (57%) delete mode 100644 test/execution.jl create mode 100644 test/execution/advanced.jl create mode 100644 test/execution/atomics.jl create mode 100644 test/execution/basic.jl create mode 100644 test/execution/broadcast.jl create mode 100644 test/execution/hints.jl create mode 100644 test/execution/reductions.jl diff --git a/test/codegen/integration.jl b/test/codegen/integration.jl new file mode 100644 index 0000000..2e0cb80 --- /dev/null +++ b/test/codegen/integration.jl @@ -0,0 +1,1075 @@ + +#============================================================================= + Type Support +=============================================================================# + +@testset "Type Support" begin + spec = ct.ArraySpec{1}(16, true) + + @testset "Float32" begin + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + ct.store(b, pid, tile) + return + end + end + end + + @testset "Float64" begin + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float64,1,spec}, ct.TileArray{Float64,1,spec}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + @check "addf" + result = tile + tile + ct.store(b, pid, result) + return + end + end + end + + @testset "Float16" begin + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float16,1,spec}, ct.TileArray{Float16,1,spec}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + @check "addf" + result = tile + tile + ct.store(b, pid, result) + return + end + end + end + + @testset "Int32" begin + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Int32,1,spec}, ct.TileArray{Int32,1,spec}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + @check "addi" + result = tile + tile + ct.store(b, pid, result) + return + end + end + end +end + +#============================================================================= + Integration Tests +=============================================================================# + +@testset "Integration" begin + @testset "vector add kernel" begin + spec = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}, ct.Constant{Int,16}}) do a, b, c, tile + @check "get_tile_block_id" + bid = ct.bid(1) + @check "load_view_tko" + a_tile = ct.load(a, bid, (tile[],)) + @check "load_view_tko" + b_tile = ct.load(b, bid, (tile[],)) + @check "addf" + result = a_tile + b_tile + @check "store_view_tko" + ct.store(c, bid, result) + @check "return" + return + end + end + end + + @testset "transpose kernel" begin + spec = ct.ArraySpec{2}(16, true) + @test @filecheck 
begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Float32,2,spec}, ct.Constant{Int,32}, ct.Constant{Int,32}}) do x, y, tm, tn + @check "get_tile_block_id" + bidx = ct.bid(1) + bidy = ct.bid(2) + @check "load_view_tko" + input_tile = ct.load(x, (bidx, bidy), (tm[], tn[])) + @check "permute" + transposed_tile = transpose(input_tile) + @check "store_view_tko" + ct.store(y, (bidy, bidx), transposed_tile) + @check "return" + return + end + end + end + + @testset "matmul reduction loop" begin + spec = ct.ArraySpec{2}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Float32,2,spec}, ct.TileArray{Float32,2,spec}, ct.Constant{Int,32}, ct.Constant{Int,32}, ct.Constant{Int,16}}) do A, B, C, tm, tn, tk + bid = ct.bid(1) + num_k = ct.num_tiles(A, 2, (tm[], tk[])) + acc = ct.full((tm[], tn[]), zero(Float32), Float32) + # NOTE: Uses while-loop pattern because Julia's for-loop generates + # complex iterator IR with PhiNodes that isn't fully supported. + # The structurizer upgrades this counting while-loop to a ForOp. + @check "for" + k = Int32(1) + while k <= num_k + @check "load_view_tko" + a = ct.load(A, (bid, k), (tm[], tk[]); padding_mode=ct.PaddingMode.Zero) + @check "load_view_tko" + b = ct.load(B, (k, bid), (tk[], tn[]); padding_mode=ct.PaddingMode.Zero) + @check "mma" + acc = muladd(a, b, acc) + k += Int32(1) + end + @check "store_view_tko" + ct.store(C, (bid, bid), acc) + return + end + end + end + + @testset "layernorm forward pattern (multiple sequential for loops)" begin + # This test captures the pattern from layer_norm_fwd: + # Multiple sequential for loops (mean accumulation, then output pass) + spec = ct.ArraySpec{2}(16, true) + spec1d = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Float32,2,spec}, + ct.TileArray{Float32,1,spec1d}, ct.Constant{Int,16}}) do X, Y, Sum, TILE_N + bid_m = ct.bid(1) + num_tiles = ct.num_tiles(X, 2, (1, TILE_N[])) + + # First for loop: compute sum + @check "for" + acc = ct.full((1, TILE_N[]), 0.0f0, Float32) + j = Int32(1) + while j <= num_tiles + tx = ct.load(X, (bid_m, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero) + acc = acc .+ tx + j += Int32(1) + end + @check "reduce" + sum_val = sum(acc; dims=2) + ct.store(Sum, bid_m, sum_val) + + # Second for loop: scale output by sum + @check "for" + j = Int32(1) + while j <= num_tiles + tx = ct.load(X, (bid_m, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero) + ty = tx .* sum_val + ct.store(Y, (bid_m, j), ty) + j += Int32(1) + end + return + end + end + end + + @testset "layernorm backward pattern (atomic spinloop)" begin + # This test captures the actual pattern from layer_norm_bwd_dx_partial_dwdb: + # A for loop iterating over tiles, with a spinloop inside for atomic accumulation + spec = ct.ArraySpec{1}(16, true) + spec2d = ct.ArraySpec{2}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}, + ct.TileArray{Int32,1,spec}, Int32, ct.Constant{Int,16}}) do DW, Partial, Locks, group_bid, TILE_N + bid = ct.bid(1) + num_tiles = ct.num_tiles(DW, 2, (1, TILE_N[])) + + @check "for" + j = Int32(1) + while j <= num_tiles + # Load and compute partial result + partial = ct.load(Partial, (bid, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero) + + @check "loop" + # Acquire spinlock (nested inside for loop) + while ct.atomic_cas(Locks, 
group_bid, Int32(0), Int32(1); + memory_order=ct.MemoryOrder.Acquire) == Int32(1) + # spin + end + + # Critical section: accumulate + @check "load_view_tko" + acc = ct.load(DW, (group_bid, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero) + @check "addf" + acc = acc .+ partial + @check "store_view_tko" + ct.store(DW, (group_bid, j), acc) + + # Release spinlock + @check "atomic_rmw_tko" + ct.atomic_xchg(Locks, group_bid, Int32(0); + memory_order=ct.MemoryOrder.Release) + + j += Int32(1) + end + return + end + end + end + + @testset "nested spinloop uses correct loop index (regression test)" begin + # This test catches a bug where nested while loops inside for loops + # shadow the for loop's induction variable, causing incorrect indexing. + # The bug: store uses (group_bid, group_bid) instead of (group_bid, loopIdx) + spec = ct.ArraySpec{2}(16, true) + spec1d = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + @check "for %loopIdx in" + @check "loop iter_values" + # The store MUST use a column index derived from loopIdx, not the spinloop result + # After 1→0 index conversion, the store uses (loopIdx - 1) + @check "[[IDX:%.+]] = subi %loopIdx" + @check "store_view_tko{{.*}}[%{{[^,]+}}, [[IDX]]]" + code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Int32,1,spec1d}, + Int32, ct.Constant{Int,4}, ct.Constant{Int,4}}) do DB, Locks, num_iters, GROUP_SIZE_M, TILE_N + bid_m = ct.bid(1) + # group_bid_m: 1-indexed group ID + group_bid_m = ((bid_m - Int32(1)) % Int32(GROUP_SIZE_M[])) + Int32(1) + + j = Int32(1) + while j <= num_iters + # Nested spinloop - this must not shadow loopIdx + while ct.atomic_cas(Locks, group_bid_m, Int32(0), Int32(1); + memory_order=ct.MemoryOrder.Acquire) == Int32(1) + end + + val = ct.full((1, TILE_N[]), 1.0f0, Float32) + ct.store(DB, (group_bid_m, j), val) + + ct.atomic_xchg(Locks, group_bid_m, Int32(0); + memory_order=ct.MemoryOrder.Release) + + j += Int32(1) + end + return + end + end + end + + @testset "nested spinloop captures correct outer variable (regression test)" begin + # This test catches a bug where nested while loops inside for loops + # capture the for loop's induction variable instead of the correct outer variable. + # The bug: spinloop uses loopIdx for atomic_cas instead of group_bid_m, causing hangs. + # + # The inner loop should capture %iterArg0 (group_bid_m), NOT %loopIdx. + # Bug produces: loop iter_values(%arg9 = %loopIdx, ...) + # Correct: loop iter_values(%arg9 = %iterArg0, ...) 
+ spec = ct.ArraySpec{2}(16, true) + spec1d = ct.ArraySpec{1}(16, true) + @test @filecheck begin + # The inner loop must NOT capture %loopIdx - it should capture %iterArg0 + # Bug: "loop iter_values(%arg9 = %loopIdx" + @check_not "iter_values({{.*}}= %loopIdx" + ct.code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Int32,1,spec1d}, + Int32, ct.Constant{Int,4}, ct.Constant{Int,4}}) do DB, Locks, num_iters, GROUP_SIZE_M, TILE_N + bid_m = ct.bid(1) + # group_bid_m: 1-indexed group ID + group_bid_m = ((bid_m - Int32(1)) % Int32(GROUP_SIZE_M[])) + Int32(1) + + j = Int32(1) + while j <= num_iters + # Spinloop should use group_bid_m for the lock, not j + while ct.atomic_cas(Locks, group_bid_m, Int32(0), Int32(1); + memory_order=ct.MemoryOrder.Acquire) == Int32(1) + end + + val = ct.full((1, TILE_N[]), 1.0f0, Float32) + ct.store(DB, (group_bid_m, j), val) + + ct.atomic_xchg(Locks, group_bid_m, Int32(0); + memory_order=ct.MemoryOrder.Release) + + j += Int32(1) + end + return + end + end + end + + @testset "nested while loops compile correctly (regression test)" begin + # Regression test: Nested while loops must compile without errors. + # Previously, nested WhileOp caused "operand index out of bounds" errors + # during bytecode parsing due to value ID conflicts in nested regions. + spec1d = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + @check "loop iter_values" + @check "loop iter_values" + code_tiled(Tuple{ct.TileArray{Int32,1,spec1d}, ct.TileArray{Int32,1,spec1d}}) do Locks1, Locks2 + idx = ct.bid(1) + + # Outer spinloop + while ct.atomic_cas(Locks1, idx, Int32(0), Int32(1); + memory_order=ct.MemoryOrder.Acquire) == Int32(1) + # Inner spinloop + while ct.atomic_cas(Locks2, idx, Int32(0), Int32(1); + memory_order=ct.MemoryOrder.Acquire) == Int32(1) + end + end + + return + end + end + end + + @testset "counting while loop with nested control flow upgrades to for (regression test)" begin + # Regression test: A counting while loop (j = 0; while j < n; j += 1) should be + # upgraded to ForOp even when it contains nested control flow (like inner loops). + # Previously, the for-loop pattern detection only searched for the step expression + # in the immediate body block, missing it when nested control flow was present. + spec1d = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + @check "for %loopIdx in" + @check "loop iter_values" + code_tiled(Tuple{ct.TileArray{Int32,1,spec1d}, Int32}) do Locks, num_iters + idx = ct.bid(1) + + j = Int32(1) + while j <= num_iters + # Inner spinloop - the presence of this nested loop shouldn't prevent + # the outer loop from being detected as a for-loop pattern + while ct.atomic_cas(Locks, idx, Int32(0), Int32(1); + memory_order=ct.MemoryOrder.Acquire) == Int32(1) + end + j += Int32(1) + end + + return + end + end + end + + @testset "Multiple loop results (regression test)" begin + # Regression test: A while loop with multiple iter_args must generate + # different result indices (%for#0, %for#1, etc.) for each result. + # Previously, all loop results resolved to %for#0, causing incorrect code. 
+ TILE_M = 32 + TILE_N = 1024 + + # Use ArraySpec with shape_div_by to match real CuArray behavior + spec2d = ct.ArraySpec{2}(128, true, (4, 0), (32, 32)) + spec1d = ct.ArraySpec{1}(128, true, (0,), (32,)) + + @test @filecheck begin + @check_label "entry" + # The for loop should have multiple results + @check "for %loopIdx in" + # We should see both %for#0 and %for#1 used (not the same one twice) + @check "reduce %for#1" + @check "reduce %for#0" + code_tiled(Tuple{ct.TileArray{Float32, 2, spec2d}, ct.TileArray{Float32, 2, spec2d}, + ct.TileArray{Float32, 1, spec1d}, ct.TileArray{Float32, 1, spec1d}, + ct.Constant{Int, TILE_M}, ct.Constant{Int, TILE_N}}) do DW, DB, FINAL_DW, FINAL_DB, _TILE_M, _TILE_N + bid_n = ct.bid(1) + num_tiles = ct.num_tiles(DW, 2, (_TILE_N[], _TILE_M[])) + + dw = ct.zeros((_TILE_N[], _TILE_M[]), Float32) + db = ct.zeros((_TILE_N[], _TILE_M[]), Float32) + i = Int32(1) + while i <= num_tiles + dw = dw .+ ct.load(DW, (bid_n, i), (_TILE_N[], _TILE_M[]); padding_mode=ct.PaddingMode.Zero) + db = db .+ ct.load(DB, (bid_n, i), (_TILE_N[], _TILE_M[]); padding_mode=ct.PaddingMode.Zero) + i += Int32(1) + end + + sum_dw = sum(dw; dims=2) + sum_db = sum(db; dims=2) + + ct.store(FINAL_DW, bid_n, sum_dw) + ct.store(FINAL_DB, bid_n, sum_db) + return + end + end + end + + @testset "sequential for loops with shared accumulator value" begin + # Regression test: Two sequential for loops where the second loop both: + # 1. Uses a value computed from the first loop's reduction + # 2. Has its own accumulator (loop-carried value) + # + # This pattern appears in LayerNorm forward pass where: + # - First loop computes mean/variance + # - Second loop normalizes using those computed values while accumulating + # + # Test: Sequential for loops where the second loop uses a value computed from + # the first loop's result AND has its own loop-carried accumulator. + # This exercises correct SSA index storage across multiple ForOps. 
+ spec = ct.ArraySpec{1}(16, true) + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}, + ct.Constant{Int,16}}) do out, inp, TILE_N + bid = ct.bid(1) + num_tiles = ct.num_tiles(inp, 1, (TILE_N[],)) + + # First loop: accumulate and reduce + @check "for" + acc = ct.zeros((TILE_N[],), Float32) + i = Int32(1) + while i <= num_tiles + tile = ct.load(inp, i, (TILE_N[],); padding_mode=ct.PaddingMode.Zero) + acc = acc .+ tile + i += Int32(1) + end + @check "reduce" + sum_val = sum(acc; dims=1) + + # Second loop: use sum_val AND accumulate + @check "for" + acc2 = ct.zeros((TILE_N[],), Float32) + i = Int32(1) + while i <= num_tiles + tile = ct.load(inp, i, (TILE_N[],); padding_mode=ct.PaddingMode.Zero) + @check "subf" + acc2 = acc2 .+ (tile .- sum_val) # Uses sum_val from first loop + i += Int32(1) + end + @check "reduce" + @check "store_view_tko" + ct.store(out, bid, sum(acc2; dims=1)) + + return + end + end + end + + #========================================================================= + Gather/Scatter Operations + =========================================================================# + @testset "gather/scatter" begin + spec = ct.ArraySpec{1}(16, true) + + @testset "1D gather" begin + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}}) do a, b + pid = ct.bid(1) + # Create index tile (simple: just use arange) + @check "iota" + indices = ct.arange((16,), Int32) + # Gather from array + @check "offset" + @check "load_ptr_tko" + tile = ct.gather(a, indices) + ct.store(b, pid, tile) + return + end + end + end + + @testset "1D scatter" begin + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}}) do a, b + pid = ct.bid(1) + # Load tile + tile = ct.load(a, pid, (16,)) + # Create index tile (simple: just use arange) + @check "iota" + indices = ct.arange((16,), Int32) + # Scatter to array + @check "offset" + @check "store_ptr_tko" + ct.scatter(b, indices, tile) + return + end + end + end + + @testset "1D gather with Int indices" begin + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}}) do a, b + pid = ct.bid(1) + # Use Int (Int64) to test type conversion + @check "iota" + indices = ct.arange((16,), Int) + # Should convert to Int32 internally + @check "trunci" + @check "offset" + @check "load_ptr_tko" + tile = ct.gather(a, indices) + ct.store(b, pid, tile) + return + end + end + end + + @testset "1D scatter with Int indices" begin + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}}) do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + # Use Int (Int64) to test type conversion + @check "iota" + indices = ct.arange((16,), Int) + # Should convert to Int32 internally + @check "trunci" + @check "offset" + @check "store_ptr_tko" + ct.scatter(b, indices, tile) + return + end + end + end + end + + #========================================================================= + Type Validation + =========================================================================# + @testset "type validation" begin + spec = ct.ArraySpec{1}(16, true) + + @testset "binary op type mismatch errors in Julia" begin + # This should fail with an IRError, since the intrinsic + # is invoked with mismatched types (Int32 + Int64) + @test_throws ct.IRError 
code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a + pid = ct.bid(1) # Int32 + # Force type mismatch by calling addi with different types + result = ct.Intrinsics.addi(pid, Int64(1)) + return + end + end + end + + @testset "method error detection" begin + spec = ct.ArraySpec{1}(16, true) + + isdefined(Core, :throw_methoderror) && + @testset "mismatched tile shapes with + produces MethodError" begin + spec2d = ct.ArraySpec{2}(16, true) + @test_throws "MethodError during Tile IR compilation" begin + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a + pid = ct.bid(1) + tile_a = ct.load(a, pid, (4, 8)) + tile_b = ct.load(a, pid, (8, 4)) + Base.donotdelete(tile_a + tile_b) + return + end + end + end + + isdefined(Core, :throw_methoderror) && + @testset "no matching method produces MethodError" begin + only_ints(x::Int) = x + @test_throws "MethodError during Tile IR compilation" begin + code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a + tile = ct.load(a, ct.bid(1), (16,)) + only_ints(tile) + return + end + end + end + + @testset "unsupported function produces clear error" begin + @test_throws "Unsupported function call during Tile IR compilation" begin + code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a + tile = ct.load(a, ct.bid(1), (16,)) + print(tile) + return + end + end + end + end + + #========================================================================= + Tile Shape Validation + =========================================================================# + @testset "tile shape validation" begin + spec = ct.ArraySpec{1}(16, true) + spec2d = ct.ArraySpec{2}(16, true) + + @testset "non-power-of-2 load shape rejected" begin + @test_throws "load: tile dimension 1 must be a power of 2, got 3" begin + code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a + ct.load(a, ct.bid(1), (3,)) + end + end + end + + @testset "non-power-of-2 full shape rejected" begin + @test_throws "full: tile dimension 1 must be a power of 2, got 5" begin + code_tiled(Tuple{}) do + ct.full((5,), 0.0f0, Float32) + end + end + end + + @testset "non-power-of-2 arange shape rejected" begin + @test_throws "arange: tile dimension 1 must be a power of 2, got 7" begin + code_tiled(Tuple{}) do + ct.arange((7,), Int32) + end + end + end + + @testset "non-power-of-2 reshape target rejected" begin + @test_throws "reshape: tile dimension 1 must be a power of 2, got 3" begin + code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a + tile = ct.load(a, ct.bid(1), (16,)) + reshape(tile, (3,)) + end + end + end + + @testset "zero dimension rejected" begin + @test_throws "load: tile dimension 1 must be positive, got 0" begin + code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a + ct.load(a, ct.bid(1), (0,)) + end + end + end + + @testset "negative dimension rejected" begin + @test_throws "full: tile dimension 1 must be positive, got -4" begin + code_tiled(Tuple{}) do + ct.full((-4,), 0.0f0, Float32) + end + end + end + + @testset "valid power-of-2 shapes accepted" begin + # These should not throw - test a few key sizes + code_tiled(devnull, a -> (ct.store(a, ct.bid(1), ct.load(a, ct.bid(1), (16,))); return), + Tuple{ct.TileArray{Float32,1,spec}}) + code_tiled(devnull, a -> (ct.store(a, ct.bid(1), ct.load(a, ct.bid(1), (32,))); return), + Tuple{ct.TileArray{Float32,1,spec}}) + code_tiled(devnull, a -> (ct.store(a, ct.bid(1), ct.load(a, ct.bid(1), (128,))); return), + Tuple{ct.TileArray{Float32,1,spec}}) + end + + @testset "multi-dim: all dimensions must be pow2" begin + @test_throws "load: tile dimension 2 must be a power of 2, 
got 3" begin + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a + ct.load(a, (ct.bid(1), 1), (4, 3)) + end + end + end + end + + #========================================================================= + Constant Folding + =========================================================================# + @testset "constant folding" begin + spec = ct.ArraySpec{1}(16, true) + + # XXX: This test verifies that store() returns the tile to enable constant + # folding. If this test fails after removing `return tile` from store(), + # Julia's optimizer will emit subi operations for constant index math. + # See operations.jl store() for the workaround. + @testset "store with constant index folds subtraction" begin + @test @filecheck begin + @check_label "entry" + @check "load_view_tko" + # Verify no subi appears between load and store - constant 1-1 should fold to 0 + @check_not "subi" + @check "store_view_tko" + code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a + idx = Int32(1) + tile = ct.load(a, idx, (16,)) + ct.store(a, idx, tile) + return + end + end + end + end +end + +#============================================================================= + External Constants (GlobalRef handling) +=============================================================================# + +# Constants defined outside the kernel (module-level `const`) appear as GlobalRef +# nodes in Julia IR. These must emit proper ConstantOp for numeric types, +# not ghost values (which produce nothing in the bytecode). + +const _CODEGEN_TEST_FLOAT32 = Float32(1 / log(2)) +const _CODEGEN_TEST_FLOAT64 = 3.14159 + +@testset "External Constants" begin + spec1d = ct.ArraySpec{1}(16, true) + + @testset "external Float32 constant in arithmetic" begin + # Bug 1: GlobalRef for Float32 must emit ConstantOp, not a ghost value. + # Previously, emit_value!(ctx, ::GlobalRef) wrapped all values as ghosts, + # causing MulFOp to receive `nothing` instead of a bytecode Value. 
+ @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}}) do a + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + @check "constant " + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=4) do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,)) + ct.store(a, pid, t) + return nothing + end + end + end + + @testset "occupancy only" begin + @test @filecheck begin + @check "optimization_hints=" + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=8) do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,)) + ct.store(a, pid, t) + return nothing + end + end + end + + @testset "both hints" begin + @test @filecheck begin + @check "optimization_hints=" + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120", num_ctas=4) do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,)) + ct.store(a, pid, t) + return nothing + end + end + end + + @testset "num_ctas validation" begin + # Too small + @test_throws "num_ctas must be between 1 and 16" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=0) + end + + # Too large + @test_throws "num_ctas must be between 1 and 16" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=17) + end + + # Not power of 2 + @test_throws "num_ctas must be a power of 2" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=3) + end + + @test_throws "num_ctas must be a power of 2" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=5) + end + + # Valid values should succeed + for num_ctas in [1, 2, 4, 8, 16] + @test @filecheck begin + @check "num_cta_in_cga = $(num_ctas)" + ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas) + end + end + end + + @testset "occupancy validation" begin + # Too small + @test_throws "occupancy must be between 1 and 32" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=0) + end + + # Too large + @test_throws "occupancy must be between 1 and 32" begin + code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=33) + end + + # Valid boundaries + @test @filecheck begin + @check "occupancy = 1" + ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=1) + end + + @test @filecheck begin + @check "occupancy = 32" + ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=32) + end + end +end + +#============================================================================= + Load / Store Hints (operation-level optimization hints) +=============================================================================# + +@testset "Load / Store Optimization Hints" begin + # Common ArraySpecs for tests + spec1d = ct.ArraySpec{1}(16, true) + + @testset "latency only on load" begin + @test @filecheck begin + @check "load_view_tko" + @check "optimization_hints = " + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,); latency=5) + ct.store(a, pid, t) + return nothing + end + end + end + + @testset "allow_tma=false only on load" begin + @test @filecheck begin + @check "load_view_tko" + @check "optimization_hints = " + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; 
sm_arch="sm_120") do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,); allow_tma=false) + ct.store(a, pid, t) + return nothing + end + end + end + + @testset "both hints on load" begin + @test @filecheck begin + @check "load_view_tko" + @check "optimization_hints = " + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,); latency=7, allow_tma=false) + ct.store(a, pid, t) + return nothing + end + end + end + + @testset "latency only on store" begin + @test @filecheck begin + @check "store_view_tko" + @check "optimization_hints = " + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,)) + ct.store(a, pid, t; latency=3) + return nothing + end + end + end + + @testset "allow_tma=false only on store" begin + @test @filecheck begin + @check "store_view_tko" + @check "optimization_hints = " + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,)) + ct.store(a, pid, t; allow_tma=false) + return nothing + end + end + end + + @testset "both hints on store" begin + @test @filecheck begin + @check "store_view_tko" + @check "optimization_hints = " + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,)) + ct.store(a, pid, t; allow_tma=false, latency=2) + return nothing + end + end + end + + @testset "latency validation" begin + @test_throws "latency must be between 1 and 10" begin + code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a + pid = ct.bid(1) + ct.load(a, pid, (16,); latency=11) + end + end + + @test @filecheck begin + @check "optimization_hints = " + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a + pid = ct.bid(1) + t = ct.load(a, pid, (16,); latency=8) + ct.store(a, pid, t) + return nothing + end + end + end + + @testset "multiple operations with mixed hints" begin + @test @filecheck begin + # First load with latency + @check "load_view_tko" + @check "optimization_hints = " + # Second load with allow_tma=false + @check "load_view_tko" + @check "optimization_hints = " + # Third load with no hints + @check "load_view_tko" + @check_not "optimization_hints" + ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}, + ct.TileArray{Float32, 1, spec1d}, + ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b, c + pid = ct.bid(1) + t1 = ct.load(a, pid, (16,); latency=5) + t2 = ct.load(b, pid, (16,); allow_tma=false) + t3 = ct.load(c, pid, (16,)) + result = t1 + t2 + t3 + ct.store(a, pid, result) + return nothing + end + end + end + + # Pointer-based operations (gather/scatter) with latency hints + @testset "gather with latency hint" begin + @test @filecheck begin + @check "load_ptr_tko" + @check "optimization_hints = " + code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}, ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b + pid = ct.bid(1) + indices = ct.arange((16,), Int32) + tile = ct.gather(a, indices; latency=3) + ct.store(b, pid, tile) + return nothing + end + end + end + + @testset "scatter with latency hint" begin + @test @filecheck begin + @check "store_ptr_tko" + @check "optimization_hints = " + code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}, ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + indices = ct.arange((16,), Int32) + ct.scatter(b, indices, tile; latency=5) + return nothing + 
end + end + end +end diff --git a/test/codegen.jl b/test/codegen/operations.jl similarity index 57% rename from test/codegen.jl rename to test/codegen/operations.jl index cb9c2fc..bd822f2 100644 --- a/test/codegen.jl +++ b/test/codegen/operations.jl @@ -1516,1078 +1516,3 @@ # TODO: assume - optimization hint # TODO: print_tko - debug printing end - -#============================================================================= - Type Support -=============================================================================# - -@testset "Type Support" begin - spec = ct.ArraySpec{1}(16, true) - - @testset "Float32" begin - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - ct.store(b, pid, tile) - return - end - end - end - - @testset "Float64" begin - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float64,1,spec}, ct.TileArray{Float64,1,spec}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - @check "addf" - result = tile + tile - ct.store(b, pid, result) - return - end - end - end - - @testset "Float16" begin - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float16,1,spec}, ct.TileArray{Float16,1,spec}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - @check "addf" - result = tile + tile - ct.store(b, pid, result) - return - end - end - end - - @testset "Int32" begin - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Int32,1,spec}, ct.TileArray{Int32,1,spec}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - @check "addi" - result = tile + tile - ct.store(b, pid, result) - return - end - end - end -end - -#============================================================================= - Integration Tests -=============================================================================# - -@testset "Integration" begin - @testset "vector add kernel" begin - spec = ct.ArraySpec{1}(16, true) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}, ct.Constant{Int,16}}) do a, b, c, tile - @check "get_tile_block_id" - bid = ct.bid(1) - @check "load_view_tko" - a_tile = ct.load(a, bid, (tile[],)) - @check "load_view_tko" - b_tile = ct.load(b, bid, (tile[],)) - @check "addf" - result = a_tile + b_tile - @check "store_view_tko" - ct.store(c, bid, result) - @check "return" - return - end - end - end - - @testset "transpose kernel" begin - spec = ct.ArraySpec{2}(16, true) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Float32,2,spec}, ct.Constant{Int,32}, ct.Constant{Int,32}}) do x, y, tm, tn - @check "get_tile_block_id" - bidx = ct.bid(1) - bidy = ct.bid(2) - @check "load_view_tko" - input_tile = ct.load(x, (bidx, bidy), (tm[], tn[])) - @check "permute" - transposed_tile = transpose(input_tile) - @check "store_view_tko" - ct.store(y, (bidy, bidx), transposed_tile) - @check "return" - return - end - end - end - - @testset "matmul reduction loop" begin - spec = ct.ArraySpec{2}(16, true) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Float32,2,spec}, ct.TileArray{Float32,2,spec}, ct.Constant{Int,32}, ct.Constant{Int,32}, ct.Constant{Int,16}}) do A, B, C, tm, tn, tk - bid = ct.bid(1) - num_k = ct.num_tiles(A, 2, (tm[], tk[])) - acc = 
ct.full((tm[], tn[]), zero(Float32), Float32) - # NOTE: Uses while-loop pattern because Julia's for-loop generates - # complex iterator IR with PhiNodes that isn't fully supported. - # The structurizer upgrades this counting while-loop to a ForOp. - @check "for" - k = Int32(1) - while k <= num_k - @check "load_view_tko" - a = ct.load(A, (bid, k), (tm[], tk[]); padding_mode=ct.PaddingMode.Zero) - @check "load_view_tko" - b = ct.load(B, (k, bid), (tk[], tn[]); padding_mode=ct.PaddingMode.Zero) - @check "mma" - acc = muladd(a, b, acc) - k += Int32(1) - end - @check "store_view_tko" - ct.store(C, (bid, bid), acc) - return - end - end - end - - @testset "layernorm forward pattern (multiple sequential for loops)" begin - # This test captures the pattern from layer_norm_fwd: - # Multiple sequential for loops (mean accumulation, then output pass) - spec = ct.ArraySpec{2}(16, true) - spec1d = ct.ArraySpec{1}(16, true) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Float32,2,spec}, - ct.TileArray{Float32,1,spec1d}, ct.Constant{Int,16}}) do X, Y, Sum, TILE_N - bid_m = ct.bid(1) - num_tiles = ct.num_tiles(X, 2, (1, TILE_N[])) - - # First for loop: compute sum - @check "for" - acc = ct.full((1, TILE_N[]), 0.0f0, Float32) - j = Int32(1) - while j <= num_tiles - tx = ct.load(X, (bid_m, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero) - acc = acc .+ tx - j += Int32(1) - end - @check "reduce" - sum_val = sum(acc; dims=2) - ct.store(Sum, bid_m, sum_val) - - # Second for loop: scale output by sum - @check "for" - j = Int32(1) - while j <= num_tiles - tx = ct.load(X, (bid_m, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero) - ty = tx .* sum_val - ct.store(Y, (bid_m, j), ty) - j += Int32(1) - end - return - end - end - end - - @testset "layernorm backward pattern (atomic spinloop)" begin - # This test captures the actual pattern from layer_norm_bwd_dx_partial_dwdb: - # A for loop iterating over tiles, with a spinloop inside for atomic accumulation - spec = ct.ArraySpec{1}(16, true) - spec2d = ct.ArraySpec{2}(16, true) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}, ct.TileArray{Float32,2,spec2d}, - ct.TileArray{Int32,1,spec}, Int32, ct.Constant{Int,16}}) do DW, Partial, Locks, group_bid, TILE_N - bid = ct.bid(1) - num_tiles = ct.num_tiles(DW, 2, (1, TILE_N[])) - - @check "for" - j = Int32(1) - while j <= num_tiles - # Load and compute partial result - partial = ct.load(Partial, (bid, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero) - - @check "loop" - # Acquire spinlock (nested inside for loop) - while ct.atomic_cas(Locks, group_bid, Int32(0), Int32(1); - memory_order=ct.MemoryOrder.Acquire) == Int32(1) - # spin - end - - # Critical section: accumulate - @check "load_view_tko" - acc = ct.load(DW, (group_bid, j), (1, TILE_N[]); padding_mode=ct.PaddingMode.Zero) - @check "addf" - acc = acc .+ partial - @check "store_view_tko" - ct.store(DW, (group_bid, j), acc) - - # Release spinlock - @check "atomic_rmw_tko" - ct.atomic_xchg(Locks, group_bid, Int32(0); - memory_order=ct.MemoryOrder.Release) - - j += Int32(1) - end - return - end - end - end - - @testset "nested spinloop uses correct loop index (regression test)" begin - # This test catches a bug where nested while loops inside for loops - # shadow the for loop's induction variable, causing incorrect indexing. 
- # The bug: store uses (group_bid, group_bid) instead of (group_bid, loopIdx) - spec = ct.ArraySpec{2}(16, true) - spec1d = ct.ArraySpec{1}(16, true) - @test @filecheck begin - @check_label "entry" - @check "for %loopIdx in" - @check "loop iter_values" - # The store MUST use a column index derived from loopIdx, not the spinloop result - # After 1→0 index conversion, the store uses (loopIdx - 1) - @check "[[IDX:%.+]] = subi %loopIdx" - @check "store_view_tko{{.*}}[%{{[^,]+}}, [[IDX]]]" - code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Int32,1,spec1d}, - Int32, ct.Constant{Int,4}, ct.Constant{Int,4}}) do DB, Locks, num_iters, GROUP_SIZE_M, TILE_N - bid_m = ct.bid(1) - # group_bid_m: 1-indexed group ID - group_bid_m = ((bid_m - Int32(1)) % Int32(GROUP_SIZE_M[])) + Int32(1) - - j = Int32(1) - while j <= num_iters - # Nested spinloop - this must not shadow loopIdx - while ct.atomic_cas(Locks, group_bid_m, Int32(0), Int32(1); - memory_order=ct.MemoryOrder.Acquire) == Int32(1) - end - - val = ct.full((1, TILE_N[]), 1.0f0, Float32) - ct.store(DB, (group_bid_m, j), val) - - ct.atomic_xchg(Locks, group_bid_m, Int32(0); - memory_order=ct.MemoryOrder.Release) - - j += Int32(1) - end - return - end - end - end - - @testset "nested spinloop captures correct outer variable (regression test)" begin - # This test catches a bug where nested while loops inside for loops - # capture the for loop's induction variable instead of the correct outer variable. - # The bug: spinloop uses loopIdx for atomic_cas instead of group_bid_m, causing hangs. - # - # The inner loop should capture %iterArg0 (group_bid_m), NOT %loopIdx. - # Bug produces: loop iter_values(%arg9 = %loopIdx, ...) - # Correct: loop iter_values(%arg9 = %iterArg0, ...) - spec = ct.ArraySpec{2}(16, true) - spec1d = ct.ArraySpec{1}(16, true) - @test @filecheck begin - # The inner loop must NOT capture %loopIdx - it should capture %iterArg0 - # Bug: "loop iter_values(%arg9 = %loopIdx" - @check_not "iter_values({{.*}}= %loopIdx" - ct.code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Int32,1,spec1d}, - Int32, ct.Constant{Int,4}, ct.Constant{Int,4}}) do DB, Locks, num_iters, GROUP_SIZE_M, TILE_N - bid_m = ct.bid(1) - # group_bid_m: 1-indexed group ID - group_bid_m = ((bid_m - Int32(1)) % Int32(GROUP_SIZE_M[])) + Int32(1) - - j = Int32(1) - while j <= num_iters - # Spinloop should use group_bid_m for the lock, not j - while ct.atomic_cas(Locks, group_bid_m, Int32(0), Int32(1); - memory_order=ct.MemoryOrder.Acquire) == Int32(1) - end - - val = ct.full((1, TILE_N[]), 1.0f0, Float32) - ct.store(DB, (group_bid_m, j), val) - - ct.atomic_xchg(Locks, group_bid_m, Int32(0); - memory_order=ct.MemoryOrder.Release) - - j += Int32(1) - end - return - end - end - end - - @testset "nested while loops compile correctly (regression test)" begin - # Regression test: Nested while loops must compile without errors. - # Previously, nested WhileOp caused "operand index out of bounds" errors - # during bytecode parsing due to value ID conflicts in nested regions. 
- spec1d = ct.ArraySpec{1}(16, true) - @test @filecheck begin - @check_label "entry" - @check "loop iter_values" - @check "loop iter_values" - code_tiled(Tuple{ct.TileArray{Int32,1,spec1d}, ct.TileArray{Int32,1,spec1d}}) do Locks1, Locks2 - idx = ct.bid(1) - - # Outer spinloop - while ct.atomic_cas(Locks1, idx, Int32(0), Int32(1); - memory_order=ct.MemoryOrder.Acquire) == Int32(1) - # Inner spinloop - while ct.atomic_cas(Locks2, idx, Int32(0), Int32(1); - memory_order=ct.MemoryOrder.Acquire) == Int32(1) - end - end - - return - end - end - end - - @testset "counting while loop with nested control flow upgrades to for (regression test)" begin - # Regression test: A counting while loop (j = 0; while j < n; j += 1) should be - # upgraded to ForOp even when it contains nested control flow (like inner loops). - # Previously, the for-loop pattern detection only searched for the step expression - # in the immediate body block, missing it when nested control flow was present. - spec1d = ct.ArraySpec{1}(16, true) - @test @filecheck begin - @check_label "entry" - @check "for %loopIdx in" - @check "loop iter_values" - code_tiled(Tuple{ct.TileArray{Int32,1,spec1d}, Int32}) do Locks, num_iters - idx = ct.bid(1) - - j = Int32(1) - while j <= num_iters - # Inner spinloop - the presence of this nested loop shouldn't prevent - # the outer loop from being detected as a for-loop pattern - while ct.atomic_cas(Locks, idx, Int32(0), Int32(1); - memory_order=ct.MemoryOrder.Acquire) == Int32(1) - end - j += Int32(1) - end - - return - end - end - end - - @testset "Multiple loop results (regression test)" begin - # Regression test: A while loop with multiple iter_args must generate - # different result indices (%for#0, %for#1, etc.) for each result. - # Previously, all loop results resolved to %for#0, causing incorrect code. - TILE_M = 32 - TILE_N = 1024 - - # Use ArraySpec with shape_div_by to match real CuArray behavior - spec2d = ct.ArraySpec{2}(128, true, (4, 0), (32, 32)) - spec1d = ct.ArraySpec{1}(128, true, (0,), (32,)) - - @test @filecheck begin - @check_label "entry" - # The for loop should have multiple results - @check "for %loopIdx in" - # We should see both %for#0 and %for#1 used (not the same one twice) - @check "reduce %for#1" - @check "reduce %for#0" - code_tiled(Tuple{ct.TileArray{Float32, 2, spec2d}, ct.TileArray{Float32, 2, spec2d}, - ct.TileArray{Float32, 1, spec1d}, ct.TileArray{Float32, 1, spec1d}, - ct.Constant{Int, TILE_M}, ct.Constant{Int, TILE_N}}) do DW, DB, FINAL_DW, FINAL_DB, _TILE_M, _TILE_N - bid_n = ct.bid(1) - num_tiles = ct.num_tiles(DW, 2, (_TILE_N[], _TILE_M[])) - - dw = ct.zeros((_TILE_N[], _TILE_M[]), Float32) - db = ct.zeros((_TILE_N[], _TILE_M[]), Float32) - i = Int32(1) - while i <= num_tiles - dw = dw .+ ct.load(DW, (bid_n, i), (_TILE_N[], _TILE_M[]); padding_mode=ct.PaddingMode.Zero) - db = db .+ ct.load(DB, (bid_n, i), (_TILE_N[], _TILE_M[]); padding_mode=ct.PaddingMode.Zero) - i += Int32(1) - end - - sum_dw = sum(dw; dims=2) - sum_db = sum(db; dims=2) - - ct.store(FINAL_DW, bid_n, sum_dw) - ct.store(FINAL_DB, bid_n, sum_db) - return - end - end - end - - @testset "sequential for loops with shared accumulator value" begin - # Regression test: Two sequential for loops where the second loop both: - # 1. Uses a value computed from the first loop's reduction - # 2. 
Has its own accumulator (loop-carried value) - # - # This pattern appears in LayerNorm forward pass where: - # - First loop computes mean/variance - # - Second loop normalizes using those computed values while accumulating - # - # Test: Sequential for loops where the second loop uses a value computed from - # the first loop's result AND has its own loop-carried accumulator. - # This exercises correct SSA index storage across multiple ForOps. - spec = ct.ArraySpec{1}(16, true) - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}, - ct.Constant{Int,16}}) do out, inp, TILE_N - bid = ct.bid(1) - num_tiles = ct.num_tiles(inp, 1, (TILE_N[],)) - - # First loop: accumulate and reduce - @check "for" - acc = ct.zeros((TILE_N[],), Float32) - i = Int32(1) - while i <= num_tiles - tile = ct.load(inp, i, (TILE_N[],); padding_mode=ct.PaddingMode.Zero) - acc = acc .+ tile - i += Int32(1) - end - @check "reduce" - sum_val = sum(acc; dims=1) - - # Second loop: use sum_val AND accumulate - @check "for" - acc2 = ct.zeros((TILE_N[],), Float32) - i = Int32(1) - while i <= num_tiles - tile = ct.load(inp, i, (TILE_N[],); padding_mode=ct.PaddingMode.Zero) - @check "subf" - acc2 = acc2 .+ (tile .- sum_val) # Uses sum_val from first loop - i += Int32(1) - end - @check "reduce" - @check "store_view_tko" - ct.store(out, bid, sum(acc2; dims=1)) - - return - end - end - end - - #========================================================================= - Gather/Scatter Operations - =========================================================================# - @testset "gather/scatter" begin - spec = ct.ArraySpec{1}(16, true) - - @testset "1D gather" begin - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}}) do a, b - pid = ct.bid(1) - # Create index tile (simple: just use arange) - @check "iota" - indices = ct.arange((16,), Int32) - # Gather from array - @check "offset" - @check "load_ptr_tko" - tile = ct.gather(a, indices) - ct.store(b, pid, tile) - return - end - end - end - - @testset "1D scatter" begin - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}}) do a, b - pid = ct.bid(1) - # Load tile - tile = ct.load(a, pid, (16,)) - # Create index tile (simple: just use arange) - @check "iota" - indices = ct.arange((16,), Int32) - # Scatter to array - @check "offset" - @check "store_ptr_tko" - ct.scatter(b, indices, tile) - return - end - end - end - - @testset "1D gather with Int indices" begin - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}}) do a, b - pid = ct.bid(1) - # Use Int (Int64) to test type conversion - @check "iota" - indices = ct.arange((16,), Int) - # Should convert to Int32 internally - @check "trunci" - @check "offset" - @check "load_ptr_tko" - tile = ct.gather(a, indices) - ct.store(b, pid, tile) - return - end - end - end - - @testset "1D scatter with Int indices" begin - @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,1,spec}, ct.TileArray{Float32,1,spec}}) do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - # Use Int (Int64) to test type conversion - @check "iota" - indices = ct.arange((16,), Int) - # Should convert to Int32 internally - @check "trunci" - @check "offset" - @check "store_ptr_tko" - ct.scatter(b, indices, tile) - return - end - end - end - end - - 
#========================================================================= - Type Validation - =========================================================================# - @testset "type validation" begin - spec = ct.ArraySpec{1}(16, true) - - @testset "binary op type mismatch errors in Julia" begin - # This should fail with an IRError, since the intrinsic - # is invoked with mismatched types (Int32 + Int64) - @test_throws ct.IRError code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a - pid = ct.bid(1) # Int32 - # Force type mismatch by calling addi with different types - result = ct.Intrinsics.addi(pid, Int64(1)) - return - end - end - end - - @testset "method error detection" begin - spec = ct.ArraySpec{1}(16, true) - - isdefined(Core, :throw_methoderror) && - @testset "mismatched tile shapes with + produces MethodError" begin - spec2d = ct.ArraySpec{2}(16, true) - @test_throws "MethodError during Tile IR compilation" begin - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a - pid = ct.bid(1) - tile_a = ct.load(a, pid, (4, 8)) - tile_b = ct.load(a, pid, (8, 4)) - Base.donotdelete(tile_a + tile_b) - return - end - end - end - - isdefined(Core, :throw_methoderror) && - @testset "no matching method produces MethodError" begin - only_ints(x::Int) = x - @test_throws "MethodError during Tile IR compilation" begin - code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a - tile = ct.load(a, ct.bid(1), (16,)) - only_ints(tile) - return - end - end - end - - @testset "unsupported function produces clear error" begin - @test_throws "Unsupported function call during Tile IR compilation" begin - code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a - tile = ct.load(a, ct.bid(1), (16,)) - print(tile) - return - end - end - end - end - - #========================================================================= - Tile Shape Validation - =========================================================================# - @testset "tile shape validation" begin - spec = ct.ArraySpec{1}(16, true) - spec2d = ct.ArraySpec{2}(16, true) - - @testset "non-power-of-2 load shape rejected" begin - @test_throws "load: tile dimension 1 must be a power of 2, got 3" begin - code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a - ct.load(a, ct.bid(1), (3,)) - end - end - end - - @testset "non-power-of-2 full shape rejected" begin - @test_throws "full: tile dimension 1 must be a power of 2, got 5" begin - code_tiled(Tuple{}) do - ct.full((5,), 0.0f0, Float32) - end - end - end - - @testset "non-power-of-2 arange shape rejected" begin - @test_throws "arange: tile dimension 1 must be a power of 2, got 7" begin - code_tiled(Tuple{}) do - ct.arange((7,), Int32) - end - end - end - - @testset "non-power-of-2 reshape target rejected" begin - @test_throws "reshape: tile dimension 1 must be a power of 2, got 3" begin - code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a - tile = ct.load(a, ct.bid(1), (16,)) - reshape(tile, (3,)) - end - end - end - - @testset "zero dimension rejected" begin - @test_throws "load: tile dimension 1 must be positive, got 0" begin - code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a - ct.load(a, ct.bid(1), (0,)) - end - end - end - - @testset "negative dimension rejected" begin - @test_throws "full: tile dimension 1 must be positive, got -4" begin - code_tiled(Tuple{}) do - ct.full((-4,), 0.0f0, Float32) - end - end - end - - @testset "valid power-of-2 shapes accepted" begin - # These should not throw - test a few key sizes - code_tiled(devnull, a -> (ct.store(a, ct.bid(1), ct.load(a, ct.bid(1), 
(16,))); return), - Tuple{ct.TileArray{Float32,1,spec}}) - code_tiled(devnull, a -> (ct.store(a, ct.bid(1), ct.load(a, ct.bid(1), (32,))); return), - Tuple{ct.TileArray{Float32,1,spec}}) - code_tiled(devnull, a -> (ct.store(a, ct.bid(1), ct.load(a, ct.bid(1), (128,))); return), - Tuple{ct.TileArray{Float32,1,spec}}) - end - - @testset "multi-dim: all dimensions must be pow2" begin - @test_throws "load: tile dimension 2 must be a power of 2, got 3" begin - code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a - ct.load(a, (ct.bid(1), 1), (4, 3)) - end - end - end - end - - #========================================================================= - Constant Folding - =========================================================================# - @testset "constant folding" begin - spec = ct.ArraySpec{1}(16, true) - - # XXX: This test verifies that store() returns the tile to enable constant - # folding. If this test fails after removing `return tile` from store(), - # Julia's optimizer will emit subi operations for constant index math. - # See operations.jl store() for the workaround. - @testset "store with constant index folds subtraction" begin - @test @filecheck begin - @check_label "entry" - @check "load_view_tko" - # Verify no subi appears between load and store - constant 1-1 should fold to 0 - @check_not "subi" - @check "store_view_tko" - code_tiled(Tuple{ct.TileArray{Float32,1,spec}}) do a - idx = Int32(1) - tile = ct.load(a, idx, (16,)) - ct.store(a, idx, tile) - return - end - end - end - end -end - -#============================================================================= - External Constants (GlobalRef handling) -=============================================================================# - -# Constants defined outside the kernel (module-level `const`) appear as GlobalRef -# nodes in Julia IR. These must emit proper ConstantOp for numeric types, -# not ghost values (which produce nothing in the bytecode). - -const _CODEGEN_TEST_FLOAT32 = Float32(1 / log(2)) -const _CODEGEN_TEST_FLOAT64 = 3.14159 - -@testset "External Constants" begin - spec1d = ct.ArraySpec{1}(16, true) - - @testset "external Float32 constant in arithmetic" begin - # Bug 1: GlobalRef for Float32 must emit ConstantOp, not a ghost value. - # Previously, emit_value!(ctx, ::GlobalRef) wrapped all values as ghosts, - # causing MulFOp to receive `nothing` instead of a bytecode Value. 
- @test @filecheck begin - @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}}) do a - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - @check "constant " - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=4) do a - pid = ct.bid(1) - t = ct.load(a, pid, (16,)) - ct.store(a, pid, t) - return nothing - end - end - end - - @testset "occupancy only" begin - @test @filecheck begin - @check "optimization_hints=" - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=8) do a - pid = ct.bid(1) - t = ct.load(a, pid, (16,)) - ct.store(a, pid, t) - return nothing - end - end - end - - @testset "both hints" begin - @test @filecheck begin - @check "optimization_hints=" - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120", num_ctas=4) do a - pid = ct.bid(1) - t = ct.load(a, pid, (16,)) - ct.store(a, pid, t) - return nothing - end - end - end - - @testset "num_ctas validation" begin - # Too small - @test_throws "num_ctas must be between 1 and 16" begin - code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=0) - end - - # Too large - @test_throws "num_ctas must be between 1 and 16" begin - code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=17) - end - - # Not power of 2 - @test_throws "num_ctas must be a power of 2" begin - code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=3) - end - - @test_throws "num_ctas must be a power of 2" begin - code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas=5) - end - - # Valid values should succeed - for num_ctas in [1, 2, 4, 8, 16] - @test @filecheck begin - @check "num_cta_in_cga = $(num_ctas)" - ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", num_ctas) - end - end - end - - @testset "occupancy validation" begin - # Too small - @test_throws "occupancy must be between 1 and 32" begin - code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=0) - end - - # Too large - @test_throws "occupancy must be between 1 and 32" begin - code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=33) - end - - # Valid boundaries - @test @filecheck begin - @check "occupancy = 1" - ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=1) - end - - @test @filecheck begin - @check "occupancy = 32" - ct.code_tiled((a) -> nothing, Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_100", occupancy=32) - end - end -end - -#============================================================================= - Load / Store Hints (operation-level optimization hints) -=============================================================================# - -@testset "Load / Store Optimization Hints" begin - # Common ArraySpecs for tests - spec1d = ct.ArraySpec{1}(16, true) - - @testset "latency only on load" begin - @test @filecheck begin - @check "load_view_tko" - @check "optimization_hints = " - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a - pid = ct.bid(1) - t = ct.load(a, pid, (16,); latency=5) - ct.store(a, pid, t) - return nothing - end - end - end - - @testset "allow_tma=false only on load" begin - @test @filecheck begin - @check "load_view_tko" - @check "optimization_hints = " - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; 
sm_arch="sm_120") do a - pid = ct.bid(1) - t = ct.load(a, pid, (16,); allow_tma=false) - ct.store(a, pid, t) - return nothing - end - end - end - - @testset "both hints on load" begin - @test @filecheck begin - @check "load_view_tko" - @check "optimization_hints = " - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a - pid = ct.bid(1) - t = ct.load(a, pid, (16,); latency=7, allow_tma=false) - ct.store(a, pid, t) - return nothing - end - end - end - - @testset "latency only on store" begin - @test @filecheck begin - @check "store_view_tko" - @check "optimization_hints = " - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a - pid = ct.bid(1) - t = ct.load(a, pid, (16,)) - ct.store(a, pid, t; latency=3) - return nothing - end - end - end - - @testset "allow_tma=false only on store" begin - @test @filecheck begin - @check "store_view_tko" - @check "optimization_hints = " - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a - pid = ct.bid(1) - t = ct.load(a, pid, (16,)) - ct.store(a, pid, t; allow_tma=false) - return nothing - end - end - end - - @testset "both hints on store" begin - @test @filecheck begin - @check "store_view_tko" - @check "optimization_hints = " - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a - pid = ct.bid(1) - t = ct.load(a, pid, (16,)) - ct.store(a, pid, t; allow_tma=false, latency=2) - return nothing - end - end - end - - @testset "latency validation" begin - @test_throws "latency must be between 1 and 10" begin - code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a - pid = ct.bid(1) - ct.load(a, pid, (16,); latency=11) - end - end - - @test @filecheck begin - @check "optimization_hints = " - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a - pid = ct.bid(1) - t = ct.load(a, pid, (16,); latency=8) - ct.store(a, pid, t) - return nothing - end - end - end - - @testset "multiple operations with mixed hints" begin - @test @filecheck begin - # First load with latency - @check "load_view_tko" - @check "optimization_hints = " - # Second load with allow_tma=false - @check "load_view_tko" - @check "optimization_hints = " - # Third load with no hints - @check "load_view_tko" - @check_not "optimization_hints" - ct.code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}, - ct.TileArray{Float32, 1, spec1d}, - ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b, c - pid = ct.bid(1) - t1 = ct.load(a, pid, (16,); latency=5) - t2 = ct.load(b, pid, (16,); allow_tma=false) - t3 = ct.load(c, pid, (16,)) - result = t1 + t2 + t3 - ct.store(a, pid, result) - return nothing - end - end - end - - # Pointer-based operations (gather/scatter) with latency hints - @testset "gather with latency hint" begin - @test @filecheck begin - @check "load_ptr_tko" - @check "optimization_hints = " - code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}, ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b - pid = ct.bid(1) - indices = ct.arange((16,), Int32) - tile = ct.gather(a, indices; latency=3) - ct.store(b, pid, tile) - return nothing - end - end - end - - @testset "scatter with latency hint" begin - @test @filecheck begin - @check "store_ptr_tko" - @check "optimization_hints = " - code_tiled(Tuple{ct.TileArray{Float32, 1, spec1d}, ct.TileArray{Float32, 1, spec1d}}; sm_arch="sm_120") do a, b - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - indices = ct.arange((16,), Int32) - ct.scatter(b, indices, tile; latency=5) - return nothing - 
end - end - end -end diff --git a/test/execution.jl b/test/execution.jl deleted file mode 100644 index e01adb0..0000000 --- a/test/execution.jl +++ /dev/null @@ -1,3257 +0,0 @@ -using CUDA - -@testset "launch" begin - -@testset "1D vector add" begin - function vadd_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile_a = ct.load(a, pid, (16,)) - tile_b = ct.load(b, pid, (16,)) - ct.store(c, pid, tile_a + tile_b) - return - end - - n = 1024 - tile_size = 16 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - ct.launch(vadd_1d, cld(n, tile_size), a, b, c) - - @test Array(c) ≈ Array(a) + Array(b) -end - -@testset "1D vector sub" begin - function vsub_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile_a = ct.load(a, pid, (16,)) - tile_b = ct.load(b, pid, (16,)) - ct.store(c, pid, tile_a - tile_b) - return - end - - n = 1024 - tile_size = 16 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - ct.launch(vsub_1d, cld(n, tile_size), a, b, c) - - @test Array(c) ≈ Array(a) - Array(b) -end - -@testset "1D vector mul" begin - function vmul_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile_a = ct.load(a, pid, (16,)) - tile_b = ct.load(b, pid, (16,)) - ct.store(c, pid, tile_a .* tile_b) - return - end - - n = 1024 - tile_size = 16 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - ct.launch(vmul_1d, cld(n, tile_size), a, b, c) - - @test Array(c) ≈ Array(a) .* Array(b) -end - -@testset "2D matrix add" begin - function madd_2d(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, - c::ct.TileArray{Float32,2}) - bidx = ct.bid(1) - bidy = ct.bid(2) - tile_a = ct.load(a, (bidx, bidy), (32, 32)) - tile_b = ct.load(b, (bidx, bidy), (32, 32)) - ct.store(c, (bidx, bidy), tile_a + tile_b) - return - end - - m, n = 256, 256 - tile_x, tile_y = 32, 32 - a = CUDA.rand(Float32, m, n) - b = CUDA.rand(Float32, m, n) - c = CUDA.zeros(Float32, m, n) - - ct.launch(madd_2d, (cld(m, tile_x), cld(n, tile_y)), a, b, c) - - @test Array(c) ≈ Array(a) + Array(b) -end - -@testset "4D tensor add" begin - # 4D loads require TileArray with explicit sizes (grid only provides 3D) - function tadd_4d(a::ct.TileArray{Float32,4}, b::ct.TileArray{Float32,4}, - c::ct.TileArray{Float32,4}) - bidx = ct.bid(1) - bidy = ct.bid(2) - bidz = ct.bid(3) - # Load 4D tiles - 4th dimension index is fixed at 1 - tile_a = ct.load(a, (bidx, bidy, bidz, 1), (4, 4, 4, 2)) - tile_b = ct.load(b, (bidx, bidy, bidz, 1), (4, 4, 4, 2)) - ct.store(c, (bidx, bidy, bidz, 1), tile_a + tile_b) - return - end - - # Array shape: (d1, d2, d3, d4) with tile shape (4, 4, 4, 2) - d1, d2, d3, d4 = 16, 16, 8, 2 - tile_1, tile_2, tile_3, tile_4 = 4, 4, 4, 2 - a = CUDA.rand(Float32, d1, d2, d3, d4) - b = CUDA.rand(Float32, d1, d2, d3, d4) - c = CUDA.zeros(Float32, d1, d2, d3, d4) - - grid = (cld(d1, tile_1), cld(d2, tile_2), cld(d3, tile_3)) - ct.launch(tadd_4d, grid, a, b, c) - - @test Array(c) ≈ Array(a) + Array(b) -end - -@testset "rank mismatch load/store" begin - @testset "1D shape on 2D array" begin - function copy_1d_2d(src::ct.TileArray{Float32,2}, dst::ct.TileArray{Float32,2}) - bid = ct.bid(1) - tile = ct.load(src, (bid, 1), (16,)) - ct.store(dst, (bid, 1), tile) - return - end - - m = 64 - src = CUDA.rand(Float32, m, 1) - dst = CUDA.zeros(Float32, m, 1) - - 
ct.launch(copy_1d_2d, cld(m, 16), src, dst) - - @test Array(dst) ≈ Array(src) - end - - @testset "2D shape on 4D array" begin - function copy_2d_4d(src::ct.TileArray{Float32,4}, dst::ct.TileArray{Float32,4}) - bidx = ct.bid(1) - bidy = ct.bid(2) - tile = ct.load(src, (bidx, bidy, 1, 1), (4, 4)) - ct.store(dst, (bidx, bidy, 1, 1), tile) - return - end - - d1, d2 = 16, 16 - src = CUDA.rand(Float32, d1, d2, 1, 1) - dst = CUDA.zeros(Float32, d1, d2, 1, 1) - - ct.launch(copy_2d_4d, (cld(d1, 4), cld(d2, 4)), src, dst) - - @test Array(dst) ≈ Array(src) - end -end - -@testset "transpose" begin - function transpose_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) - bidx = ct.bid(1) - bidy = ct.bid(2) - tile = ct.load(x, (bidx, bidy), (32, 32)) - transposed = transpose(tile) - ct.store(y, (bidy, bidx), transposed) - return - end - - m, n = 256, 128 - tile_size = 32 - x = CUDA.rand(Float32, m, n) - y = CUDA.zeros(Float32, n, m) - - ct.launch(transpose_kernel, (cld(m, tile_size), cld(n, tile_size)), x, y) - - @test Array(y) ≈ transpose(Array(x)) -end - -@testset "reshape" begin - @testset "2D -> 1D reshape preserves elements" begin - function reshape_2d_to_1d_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,1}) - bid = ct.bid(1) - # Load a 4x8 tile - tile = ct.load(x, (bid, 1), (4, 8)) - # Reshape to 32 elements (flat) - reshaped = reshape(tile, (32,)) - ct.store(y, bid, reshaped) - return - end - - m, n = 64, 8 - x = CUDA.rand(Float32, m, n) - # Each of the m/4 = 16 blocks produces 32 elements - y = CUDA.zeros(Float32, m * n) - - ct.launch(reshape_2d_to_1d_kernel, cld(m, 4), x, y) - - # Verify all elements are preserved (same multiset) - x_cpu = Array(x) - y_cpu = Array(y) - for bid in 0:(cld(m, 4)-1) - row_start = bid * 4 + 1 - row_end = row_start + 3 - input_elements = sort(vec(x_cpu[row_start:row_end, 1:8])) - output_elements = sort(y_cpu[(bid*32+1):((bid+1)*32)]) - @test output_elements ≈ input_elements - end - end - - @testset "1D -> 2D reshape preserves elements" begin - function reshape_1d_to_2d_kernel(x::ct.TileArray{Float32,1}, y::ct.TileArray{Float32,2}) - bid = ct.bid(1) - # Load 32 elements - tile = ct.load(x, bid, (32,)) - # Reshape to 4x8 - reshaped = reshape(tile, (4, 8)) - ct.store(y, (bid, 1), reshaped) - return - end - - n = 512 - x = CUDA.rand(Float32, n) - m_out = n ÷ 8 - y = CUDA.zeros(Float32, m_out, 8) - - ct.launch(reshape_1d_to_2d_kernel, cld(n, 32), x, y) - - # Verify all elements are preserved (same multiset) - x_cpu = Array(x) - y_cpu = Array(y) - for bid in 0:(cld(n, 32)-1) - start_idx = bid * 32 + 1 - input_elements = sort(x_cpu[start_idx:(start_idx+31)]) - row_start = bid * 4 + 1 - output_elements = sort(vec(y_cpu[row_start:(row_start+3), 1:8])) - @test output_elements ≈ input_elements - end - end - - @testset "reshape roundtrip preserves data" begin - function reshape_roundtrip_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) - bid = ct.bid(1) - # Load 8x4 tile - tile = ct.load(x, (bid, 1), (8, 4)) - # Reshape to 32, then back to 8x4 - flat = reshape(tile, (32,)) - back = reshape(flat, (8, 4)) - ct.store(y, (bid, 1), back) - return - end - - m, n = 64, 4 - x = CUDA.rand(Float32, m, n) - y = CUDA.zeros(Float32, m, n) - - ct.launch(reshape_roundtrip_kernel, cld(m, 8), x, y) - - @test Array(y) ≈ Array(x) - end -end - -@testset "reshape column-major semantics" begin - # These tests verify that reshape matches Julia's column-major reshape behavior, - # not just that elements are preserved (which would pass even with wrong ordering). 
- # Note: tile shapes must be powers of 2. - - @testset "1D → 2D matches Julia reshape exactly" begin - function reshape_1d_to_2d_exact_kernel(x::ct.TileArray{Float32,1}, y::ct.TileArray{Float32,2}, - n::ct.Constant{Int}, shape::ct.Constant{NTuple{2,Int}}) - bid = ct.bid(1) - tile = ct.load(x, bid, (n[],)) - reshaped = reshape(tile, shape[]) - ct.store(y, (bid, 1), reshaped) - return - end - - n = 32 - shape = (4, 8) - # Sequential values to detect any reordering - x = CuArray(Float32.(1:n)) - y = CUDA.zeros(Float32, shape) - - ct.launch(reshape_1d_to_2d_exact_kernel, 1, x, y, ct.Constant(n), ct.Constant(shape)) - - # Must match Julia's column-major reshape exactly (not just same elements) - expected = reshape(Float32.(1:n), shape) - @test Array(y) ≈ expected - end - - @testset "2D → 1D matches Julia vec exactly" begin - function reshape_2d_to_1d_exact_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,1}, - shape::ct.Constant{NTuple{2,Int}}, n::ct.Constant{Int}) - bid = ct.bid(1) - tile = ct.load(x, (bid, 1), shape[]) - reshaped = reshape(tile, (n[],)) - ct.store(y, bid, reshaped) - return - end - - shape = (4, 8) - n = prod(shape) - # Create 2D array with sequential column-major values - x = CuArray(Float32.(reshape(1:n, shape))) - y = CUDA.zeros(Float32, n) - - ct.launch(reshape_2d_to_1d_exact_kernel, 1, x, y, ct.Constant(shape), ct.Constant(n)) - - # Flattening should give column-major order: 1,2,3,4,...,32 - expected = vec(Float32.(reshape(1:n, shape))) - @test Array(y) ≈ expected - end - - @testset "2D → 2D reshape matches Julia reshape exactly" begin - function reshape_2d_to_2d_exact_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}, - src_shape::ct.Constant{NTuple{2,Int}}, - tgt_shape::ct.Constant{NTuple{2,Int}}) - bid = ct.bid(1) - tile = ct.load(x, (bid, 1), src_shape[]) - reshaped = reshape(tile, tgt_shape[]) - ct.store(y, (bid, 1), reshaped) - return - end - - src_shape = (4, 8) - tgt_shape = (8, 4) - n = prod(src_shape) - x = CuArray(Float32.(reshape(1:n, src_shape))) - y = CUDA.zeros(Float32, tgt_shape) - - ct.launch(reshape_2d_to_2d_exact_kernel, 1, x, y, - ct.Constant(src_shape), ct.Constant(tgt_shape)) - - expected = reshape(Float32.(reshape(1:n, src_shape)), tgt_shape) - @test Array(y) ≈ expected - end - - @testset "3D → 2D reshape matches Julia reshape exactly" begin - function reshape_3d_to_2d_exact_kernel(x::ct.TileArray{Float32,3}, y::ct.TileArray{Float32,2}, - src_shape::ct.Constant{NTuple{3,Int}}, - tgt_shape::ct.Constant{NTuple{2,Int}}) - bid = ct.bid(1) - tile = ct.load(x, (bid, 1, 1), src_shape[]) - reshaped = reshape(tile, tgt_shape[]) - ct.store(y, (bid, 1), reshaped) - return - end - - src_shape = (2, 4, 4) - tgt_shape = (8, 4) - n = prod(src_shape) - x = CuArray(Float32.(reshape(1:n, src_shape))) - y = CUDA.zeros(Float32, tgt_shape) - - ct.launch(reshape_3d_to_2d_exact_kernel, 1, x, y, - ct.Constant(src_shape), ct.Constant(tgt_shape)) - - expected = reshape(Float32.(reshape(1:n, src_shape)), tgt_shape) - @test Array(y) ≈ expected - end - - @testset "3D reshape round-trip with packing dim D=$D" for D in [2, 4] - # This is the atom_packing pattern: (BS, N, 2) → (BS, N*2/D, D) → (BS, N, 2) - function reshape_roundtrip_3d_kernel(x::ct.TileArray{Float32,3}, y::ct.TileArray{Float32,3}, - orig_shape::ct.Constant{NTuple{3,Int}}, - packed_shape::ct.Constant{NTuple{3,Int}}) - bid = ct.bid(1) - tile = ct.load(x, (bid, 1, 1), orig_shape[]) - packed = reshape(tile, packed_shape[]) - unpacked = reshape(packed, orig_shape[]) - ct.store(y, (bid, 1, 1), 
unpacked) - return - end - - BS, N = 1, 8 - orig_shape = (BS, N, 2) - packed_shape = (BS, N * 2 ÷ D, D) - - # Sequential values to detect any reordering - x = CuArray(Float32.(reshape(1:prod(orig_shape), orig_shape))) - y = CUDA.zeros(Float32, orig_shape) - - ct.launch(reshape_roundtrip_3d_kernel, 1, x, y, - ct.Constant(orig_shape), ct.Constant(packed_shape)) - - # Round-trip must preserve exact data, not just same elements - @test Array(y) ≈ Array(x) - end - - @testset "2D → 1D → 2D round-trip preserves exact layout" begin - function reshape_2d_1d_2d_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}, - shape::ct.Constant{NTuple{2,Int}}) - bid = ct.bid(1) - tile = ct.load(x, (bid, 1), shape[]) - flat = reshape(tile, (prod(shape[]),)) - back = reshape(flat, shape[]) - ct.store(y, (bid, 1), back) - return - end - - shape = (4, 8) - x = CuArray(Float32.(reshape(1:prod(shape), shape))) - y = CUDA.zeros(Float32, shape) - - ct.launch(reshape_2d_1d_2d_kernel, 1, x, y, ct.Constant(shape)) - - @test Array(y) ≈ Array(x) - end -end - -@testset "permutedims" begin - @testset "2D permutedims (transpose-like)" begin - function permute_2d_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) - bid = ct.bid(1) - # Load 8x4 tile - tile = ct.load(x, (bid, 1), (8, 4)) - # Permute with (2, 1) to swap dimensions: (8, 4) -> (4, 8) - permuted = permutedims(tile, (2, 1)) - ct.store(y, (bid, 1), permuted) - return - end - - m, n = 64, 4 - x = CUDA.rand(Float32, m, n) - y = CUDA.zeros(Float32, cld(m, 8) * 4, 8) - - ct.launch(permute_2d_kernel, cld(m, 8), x, y) - - # Verify permutedims matches transpose - x_cpu = Array(x) - y_cpu = Array(y) - for bid in 0:(cld(m, 8)-1) - row_start = bid * 8 + 1 - input_tile = x_cpu[row_start:(row_start+7), 1:4] - out_row_start = bid * 4 + 1 - output_tile = y_cpu[out_row_start:(out_row_start+3), 1:8] - # Compare sorted values since memory layouts may differ - @test sort(vec(output_tile)) ≈ sort(vec(transpose(input_tile))) - end - end - - @testset "permutedims roundtrip preserves data" begin - function permute_roundtrip_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) - bid = ct.bid(1) - # Load 4x8 tile - tile = ct.load(x, (bid, 1), (4, 8)) - # Permute with (2, 1), then back with (2, 1) - permuted = permutedims(tile, (2, 1)) # (4, 8) -> (8, 4) - back = permutedims(permuted, (2, 1)) # (8, 4) -> (4, 8) - ct.store(y, (bid, 1), back) - return - end - - m, n = 64, 8 - x = CUDA.rand(Float32, m, n) - y = CUDA.zeros(Float32, m, n) - - ct.launch(permute_roundtrip_kernel, cld(m, 4), x, y) - - @test Array(y) ≈ Array(x) - end -end - -@testset "strided" begin - @testset "PermutedDimsArray" begin - function copy_kernel_2d( - src::ct.TileArray{Float32, 2}, dst::ct.TileArray{Float32, 2}, - tile_x::ct.Constant{Int}, tile_y::ct.Constant{Int} - ) - bid_x = ct.bid(1) - bid_y = ct.bid(2) - tile = ct.load(src, (bid_x, bid_y), (tile_x[], tile_y[])) - ct.store(dst, (bid_x, bid_y), tile) - return - end - - m, n = 64, 32 - tm, tn = 16, 16 - A = CuArray(Float32.(reshape(1:n*m, n, m))) - P = PermutedDimsArray(A, (2, 1)) - out = CUDA.zeros(Float32, m, n) - - grid = (cld(m, tm), cld(n, tn)) - ct.launch(copy_kernel_2d, grid, P, out, ct.Constant(tm), ct.Constant(tn)) - - @test out == permutedims(A, (2, 1)) - end - - @testset "load with order=(2,1)" begin - function order_load_kernel( - src::ct.TileArray{Float32, 2}, dst::ct.TileArray{Float32, 2}, - t::ct.Constant{Int} - ) - bid_x = ct.bid(1) - bid_y = ct.bid(2) - tile = ct.load(src, (bid_x, bid_y), (t[], t[]); order=(2, 1)) - 
ct.store(dst, (bid_x, bid_y), tile) - return - end - - n = 64; t = 16 - src = CuArray(Float32.(reshape(1:n*n, n, n))) - dst = CUDA.zeros(Float32, n, n) - - ct.launch(order_load_kernel, (cld(n, t), cld(n, t)), src, dst, ct.Constant(t)) - - @test Array(dst) ≈ transpose(Array(src)) - end - - @testset "store with order=(2,1)" begin - function order_store_kernel( - src::ct.TileArray{Float32, 2}, dst::ct.TileArray{Float32, 2}, - t::ct.Constant{Int} - ) - bid_x = ct.bid(1) - bid_y = ct.bid(2) - tile = ct.load(src, (bid_x, bid_y), (t[], t[])) - ct.store(dst, (bid_x, bid_y), tile; order=(2, 1)) - return - end - - n = 64; t = 16 - src = CuArray(Float32.(reshape(1:n*n, n, n))) - dst = CUDA.zeros(Float32, n, n) - - ct.launch(order_store_kernel, (cld(n, t), cld(n, t)), src, dst, ct.Constant(t)) - - @test Array(dst) ≈ transpose(Array(src)) - end -end - -@testset "extract" begin - @testset "extract identity (0,0) full shape" begin - function extract_identity_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) - bid = ct.bid(1) - # Load 4x8 tile - tile = ct.load(x, (bid, 1), (4, 8)) - # Extract the full tile starting at (0, 0) - extracted = ct.extract(tile, (2, 2), (4, 8)) - ct.store(y, (bid, 1), extracted) - return - end - - m, n = 64, 8 - x = CUDA.rand(Float32, m, n) - y = CUDA.zeros(Float32, m, n) - - ct.launch(extract_identity_kernel, cld(m, 4), x, y) - - # Full extract at (0,0) should preserve data - @test Array(y) ≈ Array(x) - end - - @testset "extract (1,1) smaller shape" begin - function extract_smaller_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) - bid = ct.bid(1) - # Load 8x8 tile - tile = ct.load(x, (bid, 1), (8, 8)) - # Extract 4x4 at (1, 1) - top-left corner - extracted = ct.extract(tile, (1, 1), (4, 4)) - ct.store(y, (bid, 1), extracted) - return - end - - m, n = 64, 8 - x = CUDA.rand(Float32, m, n) - y = CUDA.zeros(Float32, cld(m, 8) * 4, 4) - - ct.launch(extract_smaller_kernel, cld(m, 8), x, y) - - # Verify elements are preserved for top-left 4x4 - x_cpu = Array(x) - y_cpu = Array(y) - for bid in 0:(cld(m, 8)-1) - input_start = bid * 8 + 1 - input_slice = x_cpu[input_start:(input_start+3), 1:4] - output_start = bid * 4 + 1 - output_slice = y_cpu[output_start:(output_start+3), 1:4] - @test sort(vec(output_slice)) ≈ sort(vec(input_slice)) - end - end - - @testset "extract with slice indices" begin - # Extract uses SLICE INDICES, not offsets! 
- # For shape (8,8) -> (4,4): valid indices are {1,2} x {1,2} - # Index (2, 1) extracts rows 5-8 (the second slice in first dimension) - - function extract_all_quadrants_kernel(x::ct.TileArray{Float32,2}, - y0::ct.TileArray{Float32,2}, - y1::ct.TileArray{Float32,2}, - y2::ct.TileArray{Float32,2}, - y3::ct.TileArray{Float32,2}) - bid = ct.bid(1) - tile = ct.load(x, (bid, 1), (8, 8)) - # Extract all 4 quadrants - q0 = ct.extract(tile, (1, 1), (4, 4)) # Top-left - q1 = ct.extract(tile, (2, 1), (4, 4)) # Bottom-left - q2 = ct.extract(tile, (1, 2), (4, 4)) # Top-right - q3 = ct.extract(tile, (2, 2), (4, 4)) # Bottom-right - ct.store(y0, (bid, 1), q0) - ct.store(y1, (bid, 1), q1) - ct.store(y2, (bid, 1), q2) - ct.store(y3, (bid, 1), q3) - return - end - - # Create input with different values in each quadrant - x = CUDA.zeros(Float32, 8, 8) - x[1:4, 1:4] .= 1.0f0 # TL - x[5:8, 1:4] .= 2.0f0 # BL - x[1:4, 5:8] .= 3.0f0 # TR - x[5:8, 5:8] .= 4.0f0 # BR - - y0 = CUDA.zeros(Float32, 4, 4) - y1 = CUDA.zeros(Float32, 4, 4) - y2 = CUDA.zeros(Float32, 4, 4) - y3 = CUDA.zeros(Float32, 4, 4) - - ct.launch(extract_all_quadrants_kernel, 1, x, y0, y1, y2, y3) - - @test all(Array(y0) .≈ 1.0f0) # Top-left = 1 - @test all(Array(y1) .≈ 2.0f0) # Bottom-left = 2 - @test all(Array(y2) .≈ 3.0f0) # Top-right = 3 - @test all(Array(y3) .≈ 4.0f0) # Bottom-right = 4 - end - - @testset "extract real/imag pattern (FFT)" begin - # This is the pattern used in FFT: shape (BS, N, 2) -> (BS, N, 1) - # Real at slice index 1, imag at slice index 2 - - function extract_real_imag_kernel(x_ri::ct.TileArray{Float32,3}, - y_real::ct.TileArray{Float32,3}, - y_imag::ct.TileArray{Float32,3}) - bid = ct.bid(1) - tile = ct.load(x_ri, (bid, 1, 1), (2, 8, 2)) # (BS, N, real/imag) - # Extract real (slice 1) and imag (slice 2) in last dimension - real_part = ct.extract(tile, (1, 1, 1), (2, 8, 1)) - imag_part = ct.extract(tile, (1, 1, 2), (2, 8, 1)) - ct.store(y_real, (bid, 1, 1), real_part) - ct.store(y_imag, (bid, 1, 1), imag_part) - return - end - - # Create input: real=1.0, imag=2.0 - x = CUDA.zeros(Float32, 2, 8, 2) - x[:, :, 1] .= 1.0f0 # real - x[:, :, 2] .= 2.0f0 # imag - - y_real = CUDA.zeros(Float32, 2, 8, 1) - y_imag = CUDA.zeros(Float32, 2, 8, 1) - - ct.launch(extract_real_imag_kernel, 1, x, y_real, y_imag) - - @test all(Array(y_real) .≈ 1.0f0) # Real component - @test all(Array(y_imag) .≈ 2.0f0) # Imag component - end -end - -@testset "scalar tile getindex" begin - function tile_getindex_kernel(x::ct.TileArray{Float32,1}, y::ct.TileArray{Float32,1}) - tile = ct.load(x, 1, (8,)) - scalar = tile[3] # Extract 3rd element - ct.store(y, 1, ct.broadcast_to(ct.Tile(scalar), (8,))) - return - end - host_x = zeros(Float32, 8) - host_x[3] = 42.0f0 - x = CuArray(host_x) - y = CUDA.zeros(Float32, 8) - ct.launch(tile_getindex_kernel, 1, x, y) - @test all(Array(y) .≈ 42.0f0) -end - -@testset "scalar tile setindex" begin - function tile_setindex_kernel(x::ct.TileArray{Float32,1}, y::ct.TileArray{Float32,1}) - tile = ct.load(x, 1, (8,)) - new_tile = Base.setindex(tile, 0.0f0, 3) - ct.store(y, 1, new_tile) - return - end - x = CuArray(Float32.(1:8)) - y = CUDA.zeros(Float32, 8) - ct.launch(tile_setindex_kernel, 1, x, y) - expected = Float32.(1:8) - expected[3] = 0.0f0 - @test Array(y) ≈ expected -end - -@testset "cat" begin - @testset "cat along last axis (axis -1)" begin - function cat_last_axis_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, - c::ct.TileArray{Float32,2}) - bid = ct.bid(1) - # Load two (4, 4) tiles - tile_a = 
ct.load(a, (bid, 1), (4, 4)) - tile_b = ct.load(b, (bid, 1), (4, 4)) - # Concatenate along last axis -> (4, 8) - combined = ct.cat((tile_a, tile_b), Val(-1)) - ct.store(c, (bid, 1), combined) - return - end - - m, n = 64, 4 - a = CUDA.rand(Float32, m, n) - b = CUDA.rand(Float32, m, n) - c = CUDA.zeros(Float32, m, 8) - - ct.launch(cat_last_axis_kernel, cld(m, 4), a, b, c) - - # Verify concatenation: c[:, 1:4] should match a, c[:, 5:8] should match b - c_cpu = Array(c) - a_cpu = Array(a) - b_cpu = Array(b) - - # Due to memory layout, verify elements are preserved by checking sorted values - for bid in 0:(cld(m, 4)-1) - start_row = bid * 4 + 1 - input_a = a_cpu[start_row:(start_row+3), :] - input_b = b_cpu[start_row:(start_row+3), :] - output = c_cpu[start_row:(start_row+3), :] - - # Combined output should contain all elements from both inputs - expected = sort(vcat(vec(input_a), vec(input_b))) - actual = sort(vec(output)) - @test actual ≈ expected - end - end - - @testset "cat along first axis (axis 1)" begin - function cat_first_axis_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, - c::ct.TileArray{Float32,2}) - bid = ct.bid(1) - # Load two (4, 4) tiles - tile_a = ct.load(a, (bid, 1), (4, 4)) - tile_b = ct.load(b, (bid, 1), (4, 4)) - # Concatenate along first axis -> (8, 4) - combined = ct.cat((tile_a, tile_b), Val(1)) - ct.store(c, (bid, 1), combined) - return - end - - m, n = 32, 4 - a = CUDA.rand(Float32, m, n) - b = CUDA.rand(Float32, m, n) - c = CUDA.zeros(Float32, m * 2, n) - - ct.launch(cat_first_axis_kernel, cld(m, 4), a, b, c) - - # Verify concatenation: elements from both inputs should be preserved - c_cpu = Array(c) - a_cpu = Array(a) - b_cpu = Array(b) - - for bid in 0:(cld(m, 4)-1) - start_a = bid * 4 + 1 - start_c = bid * 8 + 1 - input_a = a_cpu[start_a:(start_a+3), :] - input_b = b_cpu[start_a:(start_a+3), :] - output = c_cpu[start_c:(start_c+7), :] - - # Combined output should contain all elements from both inputs - expected = sort(vcat(vec(input_a), vec(input_b))) - actual = sort(vec(output)) - @test actual ≈ expected - end - end - - @testset "cat roundtrip (extract then cat)" begin - # This tests cat as the inverse of extract: extract splits, cat joins - function extract_cat_roundtrip_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) - bid = ct.bid(1) - # Load 4x8 tile - tile = ct.load(x, (bid, 1), (4, 8)) - # Extract two 4x4 halves - left = ct.extract(tile, (1, 1), (4, 4)) # rows 1-4, cols 1-4 - right = ct.extract(tile, (1, 2), (4, 4)) # rows 1-4, cols 5-8 - # Cat them back together along last axis - combined = ct.cat((left, right), Val(-1)) - ct.store(y, (bid, 1), combined) - return - end - - m, n = 64, 8 - x = CUDA.rand(Float32, m, n) - y = CUDA.zeros(Float32, m, n) - - ct.launch(extract_cat_roundtrip_kernel, cld(m, 4), x, y) - - # Output should match input (roundtrip) - x_cpu = Array(x) - y_cpu = Array(y) - - for bid in 0:(cld(m, 4)-1) - start_row = bid * 4 + 1 - input = x_cpu[start_row:(start_row+3), :] - output = y_cpu[start_row:(start_row+3), :] - - @test output ≈ input - end - end -end - -@testset "matmul" begin - @testset "basic matmul" begin - function matmul_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, - c::ct.TileArray{Float32,2}) - bidx = ct.bid(1) - bidy = ct.bid(2) - # Load tiles: a is (M, K), b is (K, N) - tile_a = ct.load(a, (bidx, 1), (32, 16)) - tile_b = ct.load(b, (1, bidy), (16, 32)) - # matmul: c = a @ b (using * operator) - result = tile_a * tile_b - ct.store(c, (bidx, bidy), result) - return - end - - M, 
K, N = 64, 16, 64 - a = CUDA.rand(Float32, M, K) - b = CUDA.rand(Float32, K, N) - c = CUDA.zeros(Float32, M, N) - - grid_x = cld(M, 32) - grid_y = cld(N, 32) - ct.launch(matmul_kernel, (grid_x, grid_y, 1), a, b, c) - - # Verify against CPU reference - a_cpu = Array(a) - b_cpu = Array(b) - c_cpu = Array(c) - c_ref = a_cpu * b_cpu - - @test c_cpu ≈ c_ref - end -end - -end - -@testset "Constant parameters" begin - -@testset "1D with Constant tile size" begin - function vadd_const_tile(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}, tile::ct.Constant{Int}) - pid = ct.bid(1) - tile_a = ct.load(a, pid, (tile[],)) - tile_b = ct.load(b, pid, (tile[],)) - ct.store(c, pid, tile_a + tile_b) - return - end - - n = 1024 - tile_size = 32 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - ct.launch(vadd_const_tile, cld(n, tile_size), a, b, c, ct.Constant(tile_size)) - - @test Array(c) ≈ Array(a) + Array(b) -end - -@testset "2D with Constant tile sizes" begin - function madd_const_tiles(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, - c::ct.TileArray{Float32,2}, - tx::ct.Constant{Int}, ty::ct.Constant{Int}) - bidx = ct.bid(1) - bidy = ct.bid(2) - tile_a = ct.load(a, (bidx, bidy), (tx[], ty[])) - tile_b = ct.load(b, (bidx, bidy), (tx[], ty[])) - ct.store(c, (bidx, bidy), tile_a + tile_b) - return - end - - m, n = 256, 256 - tile_x, tile_y = 64, 64 - a = CUDA.rand(Float32, m, n) - b = CUDA.rand(Float32, m, n) - c = CUDA.zeros(Float32, m, n) - - ct.launch(madd_const_tiles, (cld(m, tile_x), cld(n, tile_y)), a, b, c, - ct.Constant(tile_x), ct.Constant(tile_y)) - - @test Array(c) ≈ Array(a) + Array(b) -end - -end - -@testset "data types" begin - -@testset "Float64" begin - function vadd_f64(a::ct.TileArray{Float64,1}, b::ct.TileArray{Float64,1}, - c::ct.TileArray{Float64,1}) - pid = ct.bid(1) - tile_a = ct.load(a, pid, (16,)) - tile_b = ct.load(b, pid, (16,)) - ct.store(c, pid, tile_a + tile_b) - return - end - - n = 1024 - tile_size = 16 - a = CUDA.rand(Float64, n) - b = CUDA.rand(Float64, n) - c = CUDA.zeros(Float64, n) - - ct.launch(vadd_f64, cld(n, tile_size), a, b, c) - - @test Array(c) ≈ Array(a) + Array(b) -end - -@testset "Float16" begin - function vadd_f16(a::ct.TileArray{Float16,1}, b::ct.TileArray{Float16,1}, - c::ct.TileArray{Float16,1}) - pid = ct.bid(1) - tile_a = ct.load(a, pid, (16,)) - tile_b = ct.load(b, pid, (16,)) - ct.store(c, pid, tile_a + tile_b) - return - end - - n = 1024 - tile_size = 16 - a = CUDA.rand(Float16, n) - b = CUDA.rand(Float16, n) - c = CUDA.zeros(Float16, n) - - ct.launch(vadd_f16, cld(n, tile_size), a, b, c) - - @test Array(c) ≈ Array(a) + Array(b) -end - -@testset "BFloat16" begin - function vadd_bf16(a::ct.TileArray{ct.BFloat16,1}, b::ct.TileArray{ct.BFloat16,1}, - c::ct.TileArray{ct.BFloat16,1}) - pid = ct.bid(1) - tile_a = ct.load(a, pid, (16,)) - tile_b = ct.load(b, pid, (16,)) - ct.store(c, pid, tile_a + tile_b) - return - end - - n = 1024 - tile_size = 16 - a = CUDA.rand(ct.BFloat16, n) - b = CUDA.rand(ct.BFloat16, n) - c = CUDA.zeros(ct.BFloat16, n) - - ct.launch(vadd_bf16, cld(n, tile_size), a, b, c) - - @test Array(c) ≈ Array(a) + Array(b) -end - -end - -@testset "compilation cache" begin - function cached_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - ct.store(b, pid, tile) - return - end - - n = 256 - tile_size = 16 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - # First launch triggers 
compilation - ct.launch(cached_kernel, cld(n, tile_size), a, b) - @test Array(b) ≈ Array(a) - - # Second launch should use cached CuFunction - a2 = CUDA.rand(Float32, n) - b2 = CUDA.zeros(Float32, n) - ct.launch(cached_kernel, cld(n, tile_size), a2, b2) - @test Array(b2) ≈ Array(a2) -end - -@testset "TileArray auto-conversion" begin - # Test that CuArrays are automatically converted to TileArray - function copy_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(src, pid, (16,)) - ct.store(dst, pid, tile) - return - end - - n = 512 - tile_size = 16 - src = CUDA.rand(Float32, n) - dst = CUDA.zeros(Float32, n) - - # Pass CuArrays directly - should auto-convert - ct.launch(copy_kernel, cld(n, tile_size), src, dst) - - @test Array(dst) ≈ Array(src) -end - -@testset "math operations" begin - -@testset "1D vector div" begin - function vdiv_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile_a = ct.load(a, pid, (16,)) - tile_b = ct.load(b, pid, (16,)) - ct.store(c, pid, tile_a ./ tile_b) - return - end - - n = 1024 - tile_size = 16 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) .+ 0.1f0 # Ensure non-zero - c = CUDA.zeros(Float32, n) - - ct.launch(vdiv_1d, cld(n, tile_size), a, b, c) - - @test Array(c) ≈ Array(a) ./ Array(b) -end - -for (op, name) in [ - (:sqrt, "sqrt"), (:abs, "abs"), (:cos, "cos"), (:sin, "sin"), - (:exp, "exp"), (:log, "log"), (:ceil, "ceil"), (:floor, "floor"), -] - @eval @testset "1D $($name)" begin - function $(Symbol("vmath_$(name)"))(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - ct.store(b, pid, $op.(tile)) - return - end - a = CUDA.rand(Float32, 1024) .+ 0.1f0 - b = CUDA.zeros(Float32, 1024) - ct.launch($(Symbol("vmath_$(name)")), cld(1024, 16), a, b) - @test Array(b) ≈ $op.(Array(a)) rtol=1e-4 - end -end - -end - -@testset "reduction operations" begin - -@testset "sum along axis 2" begin - function sum_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, (pid, 1), (1, 128)) - sums = sum(tile; dims=2) - ct.store(b, pid, sums) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, m) - - ct.launch(sum_kernel, m, a, b) - - a_cpu = Array(a) - b_cpu = Array(b) - for i in 1:m - @test b_cpu[i] ≈ sum(a_cpu[i, :]) rtol=1e-3 - end -end - -@testset "sum along axis 1" begin - function sum_axis1_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, (1, pid), (64, 1)) - sums = sum(tile; dims=1) - ct.store(b, pid, sums) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, n) - - ct.launch(sum_axis1_kernel, n, a, b) - - a_cpu = Array(a) - b_cpu = Array(b) - for j in 1:n - @test b_cpu[j] ≈ sum(a_cpu[:, j]) rtol=1e-3 - end -end - -@testset "maximum along axis 2" begin - function maximum_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, (pid, 1), (1, 128)) - maxes = maximum(tile; dims=2) - ct.store(b, pid, maxes) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, m) - - ct.launch(maximum_kernel, m, a, b) - - a_cpu = Array(a) - b_cpu = Array(b) - for i in 1:m - @test b_cpu[i] ≈ maximum(a_cpu[i, :]) - end -end - -@testset "minimum along axis 2" begin - function minimum_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = 
ct.load(a, (pid, 1), (1, 128)) - mins = minimum(tile; dims=2) - ct.store(b, pid, mins) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, m) - - ct.launch(minimum_kernel, m, a, b) - - a_cpu = Array(a) - b_cpu = Array(b) - for i in 1:m - @test b_cpu[i] ≈ minimum(a_cpu[i, :]) - end -end - -@testset "prod along axis 2" begin - function prod_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, (pid, 1), (1, 128)) - prods = prod(tile; dims=2) - ct.store(b, pid, prods) - return - end - - m, n = 64, 128 - # Use small values to avoid overflow/underflow - a = CuArray(rand(Float32, m, n) .* 0.1f0 .+ 0.95f0) - b = CUDA.zeros(Float32, m) - - ct.launch(prod_kernel, m, a, b) - - a_cpu = Array(a) - b_cpu = Array(b) - for i in 1:m - @test b_cpu[i] ≈ prod(a_cpu[i, :]) rtol=1e-2 - end -end - -@testset "reduce with custom combiner" begin - function custom_reduce_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, (pid, 1), (1, 128)) - sums = reduce((x, y) -> x + y, tile; dims=2, init=0.0f0) - ct.store(b, pid, sums) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, m) - - ct.launch(custom_reduce_kernel, m, a, b) - - a_cpu = Array(a) - b_cpu = Array(b) - for i in 1:m - @test b_cpu[i] ≈ sum(a_cpu[i, :]) rtol=1e-3 - end -end - -@testset "map(abs, tile)" begin - function map_abs_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}) - pid = ct.bid(1) - tile = ct.load(a, (pid, 1), (1, 128)) - result = map(abs, tile) - ct.store(b, (pid, 1), result) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) .- 0.5f0 - b = CUDA.zeros(Float32, m, n) - - ct.launch(map_abs_kernel, m, a, b) - - @test Array(b) ≈ abs.(Array(a)) rtol=1e-5 -end - -@testset "mapreduce(abs, +, tile)" begin - function mapreduce_abs_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, (pid, 1), (1, 128)) - sums = mapreduce(abs, +, tile; dims=2, init=0.0f0) - ct.store(b, pid, sums) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) .- 0.5f0 - b = CUDA.zeros(Float32, m) - - ct.launch(mapreduce_abs_kernel, m, a, b) - - a_cpu = Array(a) - b_cpu = Array(b) - for i in 1:m - @test b_cpu[i] ≈ sum(abs, a_cpu[i, :]) rtol=1e-3 - end -end - -@testset "mapreduce(x -> x * x, +, tile)" begin - function mapreduce_sq_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, (pid, 1), (1, 128)) - sums = mapreduce(x -> x * x, +, tile; dims=2, init=0.0f0) - ct.store(b, pid, sums) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, m) - - ct.launch(mapreduce_sq_kernel, m, a, b) - - a_cpu = Array(a) - b_cpu = Array(b) - for i in 1:m - @test b_cpu[i] ≈ sum(x -> x^2, a_cpu[i, :]) rtol=1e-3 - end -end - -@testset "dropdims" begin - # Mean-subtract pattern: reduce row to get mean, dropdims the singleton, - # then broadcast-subtract from the original tile and store the column norms. 
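# Illustrative sketch (not one of the tests below): the broadcast-subtract half
# of the mean-subtract pattern the comment above alludes to — reduce a row tile
# to its mean, then subtract it from the original tile before storing. It uses
# only operations exercised elsewhere in this file; the (1, 128) tile shape, the
# names, and the assumption that a (1, 1) tile broadcasts against (1, 128) like
# the other broadcast tests are illustrative, not part of the suite.
function mean_subtract_sketch(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2})
    pid = ct.bid(1)
    tile = ct.load(a, (pid, 1), (1, 128))    # (1, 128)
    row_mean = sum(tile; dims=2) / 128.0f0   # (1, 1)
    centered = tile .- row_mean              # broadcast: (1, 128) .- (1, 1)
    ct.store(b, (pid, 1), centered)
    return
end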
- function dropdims_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, (pid, 1), (1, 128)) # (1, 128) - row_sum = sum(tile; dims=2) # (1, 1) - row_sum_1d = dropdims(row_sum; dims=2) # (1,) - ct.store(b, pid, row_sum_1d) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, m) - - ct.launch(dropdims_kernel, m, a, b) - - a_cpu = Array(a) - b_cpu = Array(b) - for i in 1:m - @test b_cpu[i] ≈ sum(a_cpu[i, :]) rtol=1e-3 - end -end - -end - -@testset "scan" begin - -@testset "1D cumsum (forward)" begin - function cumsum_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - tile_size::ct.Constant{Int}) - bid = ct.bid(1) - tile = ct.load(a, bid, (tile_size[],)) - result = cumsum(tile; dims=1) - ct.store(b, bid, result) - return nothing - end - - sz = 32 - N = 1024 - a = CUDA.rand(Float32, N) - b = CUDA.zeros(Float32, N) - - ct.launch(cumsum_1d_kernel, cld(N, sz), a, b, ct.Constant(sz)) - - # Per-tile cumulative sum - a_cpu = Array(a) - b_cpu = Array(b) - a_reshaped = reshape(a_cpu, sz, :) - expected = mapslices(x -> accumulate(+, x), a_reshaped, dims=1) - @test b_cpu ≈ vec(expected) rtol=1e-3 -end - -@testset "2D cumsum along axis 1" begin - function cumsum_2d_axis1_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}) - pid = ct.bid(1) - tile = ct.load(a, (pid, 1), (4, 8)) - result = cumsum(tile; dims=1) - ct.store(b, (pid, 1), result) - return nothing - end - - m, n = 32, 8 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, m, n) - - ct.launch(cumsum_2d_axis1_kernel, cld(m, 4), a, b) - - a_cpu = Array(a) - b_cpu = Array(b) - # cumsum along dim 1 within each 4-row tile - for bid in 0:(cld(m, 4)-1) - rows = (bid*4+1):(bid*4+4) - for j in 1:n - @test b_cpu[rows, j] ≈ accumulate(+, a_cpu[rows, j]) rtol=1e-3 - end - end -end - -@testset "1D reverse cumsum" begin - function reverse_cumsum_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - tile_size::ct.Constant{Int}) - bid = ct.bid(1) - tile = ct.load(a, bid, (tile_size[],)) - result = cumsum(tile; dims=1, rev=true) - ct.store(b, bid, result) - return nothing - end - - sz = 32 - N = 1024 - a = CUDA.rand(Float32, N) - b = CUDA.zeros(Float32, N) - - ct.launch(reverse_cumsum_kernel, cld(N, sz), a, b, ct.Constant(sz)) - - a_cpu = Array(a) - b_cpu = Array(b) - a_reshaped = reshape(a_cpu, sz, :) - expected = mapslices(x -> reverse(accumulate(+, reverse(x))), a_reshaped, dims=1) - @test b_cpu ≈ vec(expected) rtol=1e-3 -end - -@testset "1D cumprod" begin - function cumprod_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - tile_size::ct.Constant{Int}) - bid = ct.bid(1) - tile = ct.load(a, bid, (tile_size[],)) - result = cumprod(tile; dims=1) - ct.store(b, bid, result) - return nothing - end - - sz = 32 - N = 1024 - # Use values close to 1.0 to avoid overflow/underflow - a = CuArray(rand(Float32, N) .* 0.1f0 .+ 0.95f0) - b = CUDA.zeros(Float32, N) - - ct.launch(cumprod_1d_kernel, cld(N, sz), a, b, ct.Constant(sz)) - - a_cpu = Array(a) - b_cpu = Array(b) - a_reshaped = reshape(a_cpu, sz, :) - expected = mapslices(x -> accumulate(*, x), a_reshaped, dims=1) - @test b_cpu ≈ vec(expected) rtol=1e-2 -end - -end - -@testset "scalar-tile operations" begin - -for (name, kernel_expr, cpu_expr) in [ - ("tile / scalar", :(tile / 2.0f0), :(Array(a) ./ 2.0f0)), - ("tile / integer", :(tile / 4), :(Array(a) ./ 4.0f0)), - ("scalar ./ tile", :(1.0f0 ./ tile), :(1.0f0 ./ Array(a))), - ("tile .+ scalar", :(tile .+ 3.5f0), :(Array(a) .+ 
3.5f0)), - ("scalar .+ tile", :(2.5f0 .+ tile), :(2.5f0 .+ Array(a))), - ("tile .- scalar", :(tile .- 1.5f0), :(Array(a) .- 1.5f0)), - ("scalar .- tile", :(5.0f0 .- tile), :(5.0f0 .- Array(a))), - ("tile * scalar", :(tile * 2.5f0), :(Array(a) .* 2.5f0)), - ("scalar * tile", :(3.0f0 * tile), :(3.0f0 .* Array(a))), -] - sym = Symbol("scalar_tile_", replace(name, r"[^a-zA-Z0-9]" => "_")) - @eval @testset $name begin - function $sym(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - ct.store(b, pid, $kernel_expr) - return - end - a = CUDA.rand(Float32, 1024) .+ 0.1f0 - b = CUDA.zeros(Float32, 1024) - ct.launch($sym, cld(1024, 16), a, b) - @test Array(b) ≈ $cpu_expr - end -end - -end - -const _EXEC_TEST_GLOBAL_CONST = Float32(1 / log(2)) - -@testset "global constant arithmetic" begin - # Regression test for issue #77: scalar × global constant failed during codegen. - function global_const_arith_kernel(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}, - scale::Float32) - pid = ct.bid(1) - tile = ct.load(a, pid, (16,)) - total_scale = scale * _EXEC_TEST_GLOBAL_CONST - ct.store(b, pid, tile .* total_scale) - return - end - - n = 1024 - tile_size = 16 - scale = 2.5f0 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(global_const_arith_kernel, cld(n, tile_size), a, b, scale) - - @test Array(b) ≈ Array(a) .* (scale * _EXEC_TEST_GLOBAL_CONST) -end - -@testset "tile broadcasting" begin - -@testset "1D broadcast: (1,) .+ (128,)" begin - # Test broadcasting a single-element tile to a larger tile - function broadcast_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - # Load scalar-like tile (1 element) - scalar_tile = ct.load(a, 1, (1,)) - # Load full tile (128 elements) - full_tile = ct.load(b, pid, (128,)) - # Broadcast add: (1,) .+ (128,) -> (128,) - result = scalar_tile .+ full_tile - ct.store(c, pid, result) - return - end - - n = 1024 - tile_size = 128 - a = CUDA.rand(Float32, 1) # Single element - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - ct.launch(broadcast_1d_kernel, cld(n, tile_size), a, b, c) - - # Each output element should be a[1] + b[i] - a_cpu = Array(a) - b_cpu = Array(b) - c_cpu = Array(c) - @test c_cpu ≈ a_cpu[1] .+ b_cpu -end - -@testset "2D broadcast: (1, 128) .+ (64, 1)" begin - # Test broadcasting 2D tiles with complementary shapes - function broadcast_2d_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, - c::ct.TileArray{Float32,2}) - # Load row tile (1, 128) and column tile (64, 1) - row_tile = ct.load(a, (1, 1), (1, 128)) - col_tile = ct.load(b, (1, 1), (64, 1)) - # Broadcast add: (1, 128) .+ (64, 1) -> (64, 128) - result = row_tile .+ col_tile - ct.store(c, (1, 1), result) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, 1, n) # Row vector - b = CUDA.rand(Float32, m, 1) # Column vector - c = CUDA.zeros(Float32, m, n) - - ct.launch(broadcast_2d_kernel, 1, a, b, c) - - # Result should be outer sum: c[i,j] = a[1,j] + b[i,1] - a_cpu = Array(a) - b_cpu = Array(b) - c_cpu = Array(c) - expected = a_cpu .+ b_cpu # Julia broadcasting - @test c_cpu ≈ expected -end - -@testset "broadcast mul: (4, 1) .* (1, 8)" begin - function broadcast_mul_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, - c::ct.TileArray{Float32,2}) - col_tile = ct.load(a, (1, 1), (4, 1)) - row_tile = ct.load(b, (1, 1), (1, 8)) - # Broadcast multiply: (4, 1) .* (1, 8) -> (4, 8) - result = col_tile .* row_tile - 
ct.store(c, (1, 1), result) - return - end - - a = CUDA.rand(Float32, 4, 1) - b = CUDA.rand(Float32, 1, 8) - c = CUDA.zeros(Float32, 4, 8) - - ct.launch(broadcast_mul_kernel, 1, a, b, c) - - a_cpu = Array(a) - b_cpu = Array(b) - c_cpu = Array(c) - expected = a_cpu .* b_cpu # Outer product - @test c_cpu ≈ expected -end - -@testset "broadcast sub: (128,) .- (1,)" begin - function broadcast_sub_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - full_tile = ct.load(a, pid, (128,)) - scalar_tile = ct.load(b, 1, (1,)) - # Broadcast subtract: (128,) .- (1,) -> (128,) - result = full_tile .- scalar_tile - ct.store(c, pid, result) - return - end - - n = 1024 - tile_size = 128 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, 1) # Single element - c = CUDA.zeros(Float32, n) - - ct.launch(broadcast_sub_kernel, cld(n, tile_size), a, b, c) - - a_cpu = Array(a) - b_cpu = Array(b) - c_cpu = Array(c) - @test c_cpu ≈ a_cpu .- b_cpu[1] -end - -@testset "broadcast div: (64, 128) ./ (1, 128)" begin - # Divide each row by a scaling vector - function broadcast_div_kernel(a::ct.TileArray{Float32,2}, scale::ct.TileArray{Float32,2}, - c::ct.TileArray{Float32,2}) - data = ct.load(a, (1, 1), (64, 128)) - scale_row = ct.load(scale, (1, 1), (1, 128)) - # Broadcast divide: (64, 128) ./ (1, 128) -> (64, 128) - result = data ./ scale_row - ct.store(c, (1, 1), result) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - scale = CUDA.rand(Float32, 1, n) .+ 0.1f0 # Non-zero scale factors - c = CUDA.zeros(Float32, m, n) - - ct.launch(broadcast_div_kernel, 1, a, scale, c) - - a_cpu = Array(a) - scale_cpu = Array(scale) - c_cpu = Array(c) - expected = a_cpu ./ scale_cpu - @test c_cpu ≈ expected -end - -@testset "explicit broadcast_to" begin - # Test ct.broadcast_to() for explicit shape broadcasting - function broadcast_to_kernel(a::ct.TileArray{Float32,2}, c::ct.TileArray{Float32,2}) - # Load a row tile (1, 128) - row_tile = ct.load(a, (1, 1), (1, 128)) - # Explicitly broadcast to (64, 128) - expanded = ct.broadcast_to(row_tile, (64, 128)) - ct.store(c, (1, 1), expanded) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, 1, n) - c = CUDA.zeros(Float32, m, n) - - ct.launch(broadcast_to_kernel, 1, a, c) - - a_cpu = Array(a) - c_cpu = Array(c) - # Each row of c should equal the single row of a - for i in 1:m - @test c_cpu[i, :] ≈ a_cpu[1, :] - end -end - -end - -@testset "comparison operations" begin - -for (name, op1, op2) in [ - ("< and >", :<, :>), - ("<= and >=", :<=, :>=), -] - sym = Symbol("cmp_", replace(name, r"[^a-zA-Z0-9]" => "_")) - @eval @testset "float $($name)" begin - function $sym(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - out1::ct.TileArray{Float32,1}, out2::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - ct.store(out1, pid, ct.where(broadcast($op1, ta, tb), 1.0f0, 0.0f0)) - ct.store(out2, pid, ct.where(broadcast($op2, ta, tb), 1.0f0, 0.0f0)) - return - end - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - out1 = CUDA.zeros(Float32, n) - out2 = CUDA.zeros(Float32, n) - ct.launch($sym, cld(n, 16), a, b, out1, out2) - @test Array(out1) ≈ Float32.(broadcast($op1, Array(a), Array(b))) - @test Array(out2) ≈ Float32.(broadcast($op2, Array(a), Array(b))) - end -end - -@testset "float .== and .!=" begin - function cmp_eq_ne_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - out_eq::ct.TileArray{Float32,1}, 
out_ne::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - ct.store(out_eq, pid, ct.where(ta .== tb, 1.0f0, 0.0f0)) - ct.store(out_ne, pid, ct.where(ta .!= tb, 1.0f0, 0.0f0)) - return - end - - n = 1024 - # Use integer-valued floats so equality is meaningful - a = CUDA.fill(Float32(1), n) - b = CUDA.fill(Float32(1), n) - # Set half to different values - CUDA.@allowscalar b[1:512] .= 2.0f0 - out_eq = CUDA.zeros(Float32, n) - out_ne = CUDA.zeros(Float32, n) - - ct.launch(cmp_eq_ne_kernel, cld(n, 16), a, b, out_eq, out_ne) - - @test Array(out_eq) ≈ Float32.(Array(a) .== Array(b)) - @test Array(out_ne) ≈ Float32.(Array(a) .!= Array(b)) -end - -@testset "tile vs scalar comparison" begin - function cmp_scalar_kernel(a::ct.TileArray{Float32,1}, - out::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - ct.store(out, pid, ct.where(ta .> 0.5f0, 1.0f0, 0.0f0)) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - out = CUDA.zeros(Float32, n) - - ct.launch(cmp_scalar_kernel, cld(n, 16), a, out) - - @test Array(out) ≈ Float32.(Array(a) .> 0.5f0) -end - -end - -@testset "power operations" begin - -@testset "tile .^ tile" begin - function pow_tt_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - ct.store(c, pid, ta .^ tb) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) .+ 0.5f0 # Ensure positive base - b = CUDA.rand(Float32, n) .+ 0.5f0 - c = CUDA.zeros(Float32, n) - - ct.launch(pow_tt_kernel, cld(n, 16), a, b, c) - - @test Array(c) ≈ Array(a) .^ Array(b) rtol=1e-4 -end - -@testset "tile .^ scalar" begin - function pow_ts_kernel(a::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - ct.store(c, pid, ta .^ 2.0f0) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) .+ 0.1f0 - c = CUDA.zeros(Float32, n) - - ct.launch(pow_ts_kernel, cld(n, 16), a, c) - - @test Array(c) ≈ Array(a) .^ 2.0f0 rtol=1e-4 -end - -end - -@testset "atomic operations" begin - -@testset "atomic_add Int" begin - # Test atomic_add with Int: each thread block adds 1 to a counter - function atomic_add_kernel(counters::ct.TileArray{Int,1}) - bid = ct.bid(1) - ct.atomic_add(counters, 1, 1; - memory_order=ct.MemoryOrder.AcqRel) - return - end - - n_blocks = 1000 - counters = CUDA.zeros(Int, 1) - - ct.launch(atomic_add_kernel, n_blocks, counters) - - result = Array(counters)[1] - @test result == n_blocks -end - -@testset "atomic_add Float32" begin - # Test atomic_add with Float32 - function atomic_add_f32_kernel(out::ct.TileArray{Float32,1}, val::ct.Constant{Float32}) - bid = ct.bid(1) - ct.atomic_add(out, 1, val[]; - memory_order=ct.MemoryOrder.AcqRel) - return - end - - n_blocks = 100 - out = CUDA.zeros(Float32, 1) - val = 1.5f0 - - ct.launch(atomic_add_f32_kernel, n_blocks, out, ct.Constant(val)) - - result = Array(out)[1] - @test result ≈ n_blocks * val rtol=1e-3 -end - -@testset "atomic_xchg" begin - # Test atomic_xchg: each thread exchanges, last one wins - function atomic_xchg_kernel(arr::ct.TileArray{Int,1}) - bid = ct.bid(1) - ct.atomic_xchg(arr, 1, bid + 1; - memory_order=ct.MemoryOrder.AcqRel) - return - end - - n_blocks = 10 - arr = CUDA.zeros(Int, 1) - - ct.launch(atomic_xchg_kernel, n_blocks, arr) - - # Result should be one of 1..n_blocks (whichever thread ran last) - result = Array(arr)[1] - @test 1 <= result <= n_blocks -end - -@testset "atomic_cas success" begin - 
# Test atomic_cas: only one thread should succeed in setting 0->1 - function atomic_cas_kernel(locks::ct.TileArray{Int,1}, success_count::ct.TileArray{Int,1}) - bid = ct.bid(1) - # Try to acquire lock (0 -> 1) - old = ct.atomic_cas(locks, 1, 0, 1; - memory_order=ct.MemoryOrder.AcqRel) - # If we got old=0, we succeeded - # Use atomic_add to count successes (returns a tile, so comparison works) - # Actually simpler: just increment success_count if old was 0 - # But we can't do conditionals easily here, so let's just verify lock changes - return - end - - locks = CUDA.zeros(Int, 1) - success_count = CUDA.zeros(Int, 1) - - ct.launch(atomic_cas_kernel, 100, locks, success_count) - - # Lock should be set to 1 (at least one thread succeeded) - lock_val = Array(locks)[1] - @test lock_val == 1 -end - -@testset "spinlock with token ordering" begin - # Test that token threading enforces memory ordering in spinlock patterns - function spinlock_kernel(result::ct.TileArray{Float32,1}, lock::ct.TileArray{Int,1}) - bid = ct.bid(1) - val = ct.full((1,), 1.0f0, Float32) - - # Spin until we acquire the lock (CAS returns old value, 0 means we got it) - while ct.atomic_cas(lock, 1, 0, 1; - memory_order=ct.MemoryOrder.Acquire) == 1 - end - - # Critical section: load, increment, store - # With proper token threading, these are ordered after the acquire - current = ct.load(result, 1, (1,)) - updated = current .+ val - ct.store(result, 1, updated) - - # Release the lock - ct.atomic_xchg(lock, 1, 0; - memory_order=ct.MemoryOrder.Release) - return - end - - n_blocks = 50 # Use fewer blocks to reduce test time - result = CUDA.zeros(Float32, 1) - lock = CUDA.zeros(Int, 1) - - ct.launch(spinlock_kernel, n_blocks, result, lock) - - # Each block should have added 1.0 to the result - final_result = Array(result)[1] - @test final_result == Float32(n_blocks) -end - -@testset "explicit memory ordering kwargs" begin - # Test that explicit memory_order kwargs work correctly - function explicit_ordering_kernel(result::ct.TileArray{Float32,1}, lock::ct.TileArray{Int,1}) - bid = ct.bid(1) - val = ct.full((1,), 1.0f0, Float32) - - # Spin until we acquire the lock - use explicit Acquire ordering - while ct.atomic_cas(lock, 1, 0, 1; - memory_order=ct.MemoryOrder.Acquire) == 1 - end - - # Critical section - current = ct.load(result, 1, (1,)) - updated = current .+ val - ct.store(result, 1, updated) - - # Release the lock - use explicit Release ordering - ct.atomic_xchg(lock, 1, 0; memory_order=ct.MemoryOrder.Release) - return - end - - n_blocks = 50 - result = CUDA.zeros(Float32, 1) - lock = CUDA.zeros(Int, 1) - - ct.launch(explicit_ordering_kernel, n_blocks, result, lock) - - final_result = Array(result)[1] - @test final_result == Float32(n_blocks) -end - -@testset "atomic_add with explicit kwargs" begin - # Test atomic_add with explicit memory ordering - function explicit_add_kernel(counters::ct.TileArray{Int,1}) - bid = ct.bid(1) - ct.atomic_add(counters, 1, 1; - memory_order=ct.MemoryOrder.Relaxed, - memory_scope=ct.MemScope.Device) - return - end - - n_blocks = 100 - counters = CUDA.zeros(Int, 1) - - ct.launch(explicit_add_kernel, n_blocks, counters) - - result = Array(counters)[1] - @test result == n_blocks -end - -@testset "1D gather - simple" begin - # Simple 1D gather: copy first 16 elements using gather - function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1}) - pid = ct.bid(1) - # Simple indices 0..15 - indices = ct.arange((16,), Int) - # Gather from source - tile = ct.gather(src, indices) 
- # Store to destination - ct.store(dst, pid, tile) - return - end - - n = 16 - src = CUDA.rand(Float32, n) - dst = CUDA.zeros(Float32, n) - - ct.launch(gather_simple_kernel, 1, src, dst) - - @test Array(dst) ≈ Array(src) -end - -@testset "1D scatter - simple" begin - # Simple 1D scatter: write first 16 elements using scatter - function scatter_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1}) - pid = ct.bid(1) - # Load from source - tile = ct.load(src, pid, (16,)) - # Simple indices 0..15 - indices = ct.arange((16,), Int) - # Scatter to destination - ct.scatter(dst, indices, tile) - return - end - - n = 16 - src = CUDA.rand(Float32, n) - dst = CUDA.zeros(Float32, n) - - ct.launch(scatter_simple_kernel, 1, src, dst) - - @test Array(dst) ≈ Array(src) -end - -end - -@testset "Entry Hints" begin - -@testset "launch with num_ctas" begin - function vadd_kernel_num_ctas(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - ct.store(c, pid, ta + tb) - return nothing - end - - n = 1024 - a = CUDA.ones(Float32, n) - b = CUDA.ones(Float32, n) .* 2 - c = CUDA.zeros(Float32, n) - - ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2) - - @test Array(c) ≈ ones(Float32, n) .* 3 -end - -@testset "launch with occupancy" begin - function vadd_kernel_occupancy(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - ct.store(c, pid, ta + tb) - return nothing - end - - n = 1024 - a = CUDA.ones(Float32, n) - b = CUDA.ones(Float32, n) .* 2 - c = CUDA.zeros(Float32, n) - - ct.launch(vadd_kernel_occupancy, 64, a, b, c; occupancy=4) - - @test Array(c) ≈ ones(Float32, n) .* 3 -end - -@testset "launch with both hints" begin - function vadd_kernel_both_hints(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - ct.store(c, pid, ta + tb) - return nothing - end - - n = 1024 - a = CUDA.ones(Float32, n) - b = CUDA.ones(Float32, n) .* 2 - c = CUDA.zeros(Float32, n) - - ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8) - - @test Array(c) ≈ ones(Float32, n) .* 3 -end - -end - -@testset "Load / Store Optimization Hints" begin - -@testset "load with latency hint" begin - function vadd_with_load_latency(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,); latency=5) - tb = ct.load(b, pid, (16,); latency=3) - ct.store(c, pid, ta + tb) - return nothing - end - - n = 1024 - a = CUDA.ones(Float32, n) - b = CUDA.ones(Float32, n) .* 2 - c = CUDA.zeros(Float32, n) - - ct.launch(vadd_with_load_latency, 64, a, b, c) - - @test Array(c) ≈ ones(Float32, n) .* 3 -end - -@testset "load with allow_tma=false" begin - function vadd_no_tma(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,); allow_tma=false) - tb = ct.load(b, pid, (16,); allow_tma=false) - ct.store(c, pid, ta + tb) - return nothing - end - - n = 1024 - a = CUDA.ones(Float32, n) - b = CUDA.ones(Float32, n) .* 2 - c = CUDA.zeros(Float32, n) - - ct.launch(vadd_no_tma, 64, a, b, c) - - @test Array(c) ≈ ones(Float32, n) .* 3 -end - -@testset "load with both hints" begin - function vadd_both_load_hints(a::ct.TileArray{Float32,1}, - 
b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,); latency=7, allow_tma=false) - tb = ct.load(b, pid, (16,); latency=4, allow_tma=true) - ct.store(c, pid, ta + tb) - return nothing - end - - n = 1024 - a = CUDA.ones(Float32, n) - b = CUDA.ones(Float32, n) .* 2 - c = CUDA.zeros(Float32, n) - - ct.launch(vadd_both_load_hints, 64, a, b, c) - - @test Array(c) ≈ ones(Float32, n) .* 3 -end - -@testset "store with latency hint" begin - function copy_with_store_latency(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - ct.store(b, pid, ta; latency=2) - return nothing - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(copy_with_store_latency, 64, a, b) - - @test Array(b) ≈ Array(a) -end - -@testset "store with allow_tma=false" begin - function copy_no_tma_store(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - ct.store(b, pid, ta; allow_tma=false) - return nothing - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(copy_no_tma_store, 64, a, b) - - @test Array(b) ≈ Array(a) -end - -@testset "different hints on load and store" begin - function vadd_mixed_hints(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - # Load with high latency, no TMA - ta = ct.load(a, pid, (16,); latency=8, allow_tma=false) - tb = ct.load(b, pid, (16,); latency=6, allow_tma=false) - # Store with low latency, allow TMA - ct.store(c, pid, ta + tb; latency=2, allow_tma=true) - return nothing - end - - n = 1024 - a = CUDA.ones(Float32, n) - b = CUDA.ones(Float32, n) .* 2 - c = CUDA.zeros(Float32, n) - - ct.launch(vadd_mixed_hints, 64, a, b, c) - - @test Array(c) ≈ ones(Float32, n) .* 3 -end - -@testset "2D matmul with hints" begin - function matmul_with_hints(a::ct.TileArray{Float32,2}, - b::ct.TileArray{Float32,2}, - c::ct.TileArray{Float32,2}) - bidx = ct.bid(1) - bidy = ct.bid(2) - # Load with latency hints - tile_a = ct.load(a, (bidx, 1), (32, 16); latency=5) - tile_b = ct.load(b, (1, bidy), (16, 32); latency=5) - result = tile_a * tile_b - # Store with latency hint - ct.store(c, (bidx, bidy), result; latency=3) - return nothing - end - - M, K, N = 64, 16, 64 - a = CUDA.rand(Float32, M, K) - b = CUDA.rand(Float32, K, N) - c = CUDA.zeros(Float32, M, N) - - grid_x = cld(M, 32) - grid_y = cld(N, 32) - ct.launch(matmul_with_hints, (grid_x, grid_y, 1), a, b, c) - - - # Verify against CPU reference - a_cpu = Array(a) - b_cpu = Array(b) - c_cpu = Array(c) - c_ref = a_cpu * b_cpu - - @test c_cpu ≈ c_ref -end - -@testset "reduction with hints" begin - function reduce_with_hints(a::ct.TileArray{Float32,2}, - b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - # Load with hints - tile = ct.load(a, (pid, 1), (1, 128); latency=6, allow_tma=false) - sums = sum(tile; dims=2) - # Store with hints - ct.store(b, pid, sums; latency=2) - return nothing - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, m) - - ct.launch(reduce_with_hints, m, a, b) - - - # Each row should be summed - a_cpu = Array(a) - b_cpu = Array(b) - for i in 1:m - @test b_cpu[i] ≈ sum(a_cpu[i, :]) rtol=1e-3 - end -end - -@testset "1D reduce operations" begin - TILE_SIZE = 32 - N = 1024 - - function reduce_sum_1d(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, - tileSz::ct.Constant{Int}) where {T} - ct.store(b, ct.bid(1), sum(ct.load(a, ct.bid(1), 
(tileSz[],)); dims=1)) - return nothing - end - - function reduce_max_1d(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, - tileSz::ct.Constant{Int}) where {T} - ct.store(b, ct.bid(1), maximum(ct.load(a, ct.bid(1), (tileSz[],)); dims=1)) - return nothing - end - - function cpu_reduce(a_reshaped::AbstractArray{T}, op) where {T} - result = mapslices(op, a_reshaped, dims=1)[:] - # For unsigned sum, apply mask to handle overflow - if T <: Unsigned && op === sum - result .= result .& typemax(T) - end - return result - end - - TEST_TYPES = [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64] - - TEST_OPS = [ - (reduce_sum_1d, sum), - (reduce_max_1d, maximum), - ] - - @testset "Type: $elType, Operation: $gpu_kernel" for elType in TEST_TYPES, (gpu_kernel, cpu_op) in TEST_OPS - # Generate input data with type-appropriate ranges to avoid overflow - if elType == UInt8 - a_gpu = CuArray{UInt8}(rand(UInt8(0):UInt8(7), N)) - elseif elType == Int8 - a_gpu = CuArray{Int8}(rand(-3:3, N)) - elseif elType == Int16 - a_gpu = CuArray{Int16}(rand(-800:800, N)) - elseif elType == UInt16 - a_gpu = CuArray{UInt16}(rand(1:2000, N)) - elseif elType <: Integer && elType <: Signed - a_gpu = CuArray{elType}(rand(-1000:1000, N)) - else - a_gpu = CUDA.rand(elType, N) - end - b_gpu = CUDA.zeros(elType, cld(N, TILE_SIZE)) - - ct.launch(gpu_kernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) - - a_cpu = Array(a_gpu) - b_cpu = Array(b_gpu) - a_reshaped = reshape(a_cpu, TILE_SIZE, :) - cpu_result = cpu_reduce(a_reshaped, cpu_op) - - if elType <: AbstractFloat - @test b_cpu ≈ cpu_result rtol=1e-3 - else - @test b_cpu == cpu_result - end - end -end - -@testset "1D scan (cumsum)" begin - TILE_SIZE = 32 - N = 1024 - - function scan_kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) where {T} - ct.store(b, ct.bid(1), cumsum(ct.load(a, ct.bid(1), (tileSz[],)); dims=1)) - return nothing - end - - TEST_TYPES = [Float16, Float32, Float64, Int32, Int64, UInt32, UInt64] - - @testset "Type: $elType" for elType in TEST_TYPES - # Type-appropriate input generation (small values to avoid overflow in cumsum) - if elType <: Integer && elType <: Signed - a_gpu = CuArray{elType}(rand(elType(-3):elType(3), N)) - elseif elType <: Integer - a_gpu = CuArray{elType}(rand(elType(0):elType(7), N)) - else - a_gpu = CUDA.rand(elType, N) - end - b_gpu = CUDA.zeros(elType, N) - - ct.launch(scan_kernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) - - a_cpu = Array(a_gpu) - b_cpu = Array(b_gpu) - - # CPU reference: per-tile cumulative sum - a_reshaped = reshape(a_cpu, TILE_SIZE, :) - expected = mapslices(x -> accumulate(+, x), a_reshaped, dims=1) - - if elType <: AbstractFloat - @test b_cpu ≈ vec(expected) rtol=1e-3 - else - @test b_cpu == vec(expected) - end - end -end - -@testset "any / all" begin - TILE_SIZE = 16 - - function any_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Int32,1}, - tileSz::ct.Constant{Int}) - tile = ct.load(a, ct.bid(1), (tileSz[],)) - mask = tile .> 0.0f0 - result = any(mask; dims=1) - ct.store(b, ct.bid(1), convert(ct.Tile{Int32}, result)) - return nothing - end - - function all_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Int32,1}, - tileSz::ct.Constant{Int}) - tile = ct.load(a, ct.bid(1), (tileSz[],)) - mask = tile .> 0.0f0 - result = all(mask; dims=1) - ct.store(b, ct.bid(1), convert(ct.Tile{Int32}, result)) - return nothing - end - - N = 64 - n_blocks = cld(N, TILE_SIZE) - - # All positive → any=true, all=true - a_pos = CUDA.ones(Float32, 
N) - b_any = CUDA.zeros(Int32, n_blocks) - b_all = CUDA.zeros(Int32, n_blocks) - ct.launch(any_kernel, n_blocks, a_pos, b_any, ct.Constant(TILE_SIZE)) - ct.launch(all_kernel, n_blocks, a_pos, b_all, ct.Constant(TILE_SIZE)) - @test all(Array(b_any) .== 1) - @test all(Array(b_all) .== 1) - - # All negative → any=false, all=false - a_neg = CUDA.fill(Float32(-1), N) - b_any = CUDA.zeros(Int32, n_blocks) - b_all = CUDA.zeros(Int32, n_blocks) - ct.launch(any_kernel, n_blocks, a_neg, b_any, ct.Constant(TILE_SIZE)) - ct.launch(all_kernel, n_blocks, a_neg, b_all, ct.Constant(TILE_SIZE)) - @test all(Array(b_any) .== 0) - @test all(Array(b_all) .== 0) - - # Mixed → any=true, all=false (first element positive, rest negative) - a_mix = CUDA.fill(Float32(-1), N) - # Set first element of each tile to positive - a_mix_cpu = Array(a_mix) - for i in 1:TILE_SIZE:N - a_mix_cpu[i] = 1.0f0 - end - a_mix = CuArray(a_mix_cpu) - b_any = CUDA.zeros(Int32, n_blocks) - b_all = CUDA.zeros(Int32, n_blocks) - ct.launch(any_kernel, n_blocks, a_mix, b_any, ct.Constant(TILE_SIZE)) - ct.launch(all_kernel, n_blocks, a_mix, b_all, ct.Constant(TILE_SIZE)) - @test all(Array(b_any) .== 1) - @test all(Array(b_all) .== 0) -end - -@testset "count" begin - TILE_SIZE = 16 - - function count_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Int32,1}, - tileSz::ct.Constant{Int}) - tile = ct.load(a, ct.bid(1), (tileSz[],)) - result = count(tile .> 0.0f0; dims=1) - ct.store(b, ct.bid(1), result) - return nothing - end - - N = 64 - n_blocks = cld(N, TILE_SIZE) - - # Known pattern: 3 positive per tile - a_cpu = fill(Float32(-1), N) - for i in 1:TILE_SIZE:N - a_cpu[i] = 1.0f0 - a_cpu[i+1] = 2.0f0 - a_cpu[i+2] = 3.0f0 - end - a = CuArray(a_cpu) - b = CUDA.zeros(Int32, n_blocks) - - ct.launch(count_kernel, n_blocks, a, b, ct.Constant(TILE_SIZE)) - - @test all(Array(b) .== 3) -end - -@testset "argmax / argmin" begin - TILE_SIZE = 16 - - function argmax_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Int32,2}) - tile = ct.load(a, ct.bid(1), (4, 16)) - result = argmax(tile; dims=2) - ct.store(b, ct.bid(1), result) - return nothing - end - - function argmin_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Int32,2}) - tile = ct.load(a, ct.bid(1), (4, 16)) - result = argmin(tile; dims=2) - ct.store(b, ct.bid(1), result) - return nothing - end - - m, n = 4, 16 - # Create data with known argmax/argmin positions - a_cpu = zeros(Float32, m, n) - for row in 1:m - for col in 1:n - a_cpu[row, col] = Float32(col) # max at col 16, min at col 1 - end - end - a = CuArray(a_cpu) - b_max = CUDA.zeros(Int32, m, 1) - b_min = CUDA.zeros(Int32, m, 1) - - ct.launch(argmax_kernel, 1, a, b_max) - ct.launch(argmin_kernel, 1, a, b_min) - - b_max_cpu = Array(b_max) - b_min_cpu = Array(b_min) - - # argmax should return 16 (1-indexed) for all rows - @test all(b_max_cpu .== 16) - # argmin should return 1 (1-indexed) for all rows - @test all(b_min_cpu .== 1) - - # Test with random data - a_rand = CUDA.rand(Float32, m, n) - b_max_rand = CUDA.zeros(Int32, m, 1) - b_min_rand = CUDA.zeros(Int32, m, 1) - - ct.launch(argmax_kernel, 1, a_rand, b_max_rand) - ct.launch(argmin_kernel, 1, a_rand, b_min_rand) - - a_rand_cpu = Array(a_rand) - # Compare with CPU argmax/argmin (Julia returns CartesianIndex, extract column) - for row in 1:m - expected_max = argmax(a_rand_cpu[row, :]) - expected_min = argmin(a_rand_cpu[row, :]) - @test Array(b_max_rand)[row, 1] == expected_max - @test Array(b_min_rand)[row, 1] == expected_min - end -end - -@testset "transpose with hints" begin - 
function transpose_with_hints(x::ct.TileArray{Float32,2}, - y::ct.TileArray{Float32,2}) - bidx = ct.bid(1) - bidy = ct.bid(2) - # Load with high latency - tile = ct.load(x, (bidx, bidy), (32, 32); latency=9) - transposed = transpose(tile) - # Store with lower latency - ct.store(y, (bidy, bidx), transposed; latency=4) - return nothing - end - - m, n = 256, 128 - tile_size = 32 - x = CUDA.rand(Float32, m, n) - y = CUDA.zeros(Float32, n, m) - - ct.launch(transpose_with_hints, (cld(m, tile_size), cld(n, tile_size)), x, y) - - - @test Array(y) ≈ transpose(Array(x)) -end - -@testset "complex kernel with multiple loads/stores with hints" begin - function complex_hints_kernel(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}, - d::ct.TileArray{Float32,1}) - pid = ct.bid(1) - # Multiple loads with different hints - ta = ct.load(a, pid, (16,); latency=10, allow_tma=false) - tb = ct.load(b, pid, (16,); latency=5, allow_tma=true) - tc = ct.load(c, pid, (16,); latency=7) - - # Compute result - result = ta + tb + tc - - # Store with hint - ct.store(d, pid, result; latency=1, allow_tma=false) - return nothing - end - - n = 1024 - a = CUDA.ones(Float32, n) - b = CUDA.ones(Float32, n) .* 2 - c = CUDA.ones(Float32, n) .* 3 - d = CUDA.zeros(Float32, n) - - ct.launch(complex_hints_kernel, 64, a, b, c, d) - - @test Array(d) ≈ ones(Float32, n) .* 6 -end - -@testset "hints with Float64" begin - function vadd_f64_hints(a::ct.TileArray{Float64,1}, - b::ct.TileArray{Float64,1}, - c::ct.TileArray{Float64,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,); latency=8) - tb = ct.load(b, pid, (16,); latency=8) - ct.store(c, pid, ta + tb; latency=4) - return nothing - end - - n = 1024 - a = CUDA.rand(Float64, n) - b = CUDA.rand(Float64, n) - c = CUDA.zeros(Float64, n) - - ct.launch(vadd_f64_hints, 64, a, b, c) - - @test Array(c) ≈ Array(a) + Array(b) -end - -@testset "hints with Float16" begin - function vadd_f16_hints(a::ct.TileArray{Float16,1}, - b::ct.TileArray{Float16,1}, - c::ct.TileArray{Float16,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,); latency=3, allow_tma=false) - tb = ct.load(b, pid, (16,); latency=3, allow_tma=false) - ct.store(c, pid, ta + tb; latency=1) - return nothing - end - - n = 1024 - a = CUDA.rand(Float16, n) - b = CUDA.rand(Float16, n) - c = CUDA.zeros(Float16, n) - - ct.launch(vadd_f16_hints, 64, a, b, c) - - @test Array(c) ≈ Array(a) + Array(b) -end - -@testset "boundary latency values" begin - function test_boundary_latency(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - # Min and max valid latency values - ta = ct.load(a, pid, (16,); latency=1) - ct.store(b, pid, ta; latency=10) - return nothing - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(test_boundary_latency, 64, a, b) - - @test Array(b) ≈ Array(a) -end - -# Pointer-based operations (gather/scatter) with latency hints -@testset "gather with latency hint" begin - function gather_with_latency(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - base = (pid - 1) * 16 - indices = base .+ ct.arange((16,), Int32) - tile = ct.gather(a, indices; latency=5) - ct.store(b, pid, tile) - return nothing - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(gather_with_latency, 64, a, b) - - @test Array(b) ≈ Array(a) -end - -@testset "scatter with latency hint" begin - function scatter_with_latency(a::ct.TileArray{Float32,1}, - b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile 
= ct.load(a, pid, (16,)) - base = (pid - 1) * 16 - indices = base .+ ct.arange((16,), Int32) - ct.scatter(b, indices, tile; latency=3) - return nothing - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(scatter_with_latency, 64, a, b) - - @test Array(b) ≈ Array(a) -end - -end - -@testset "where / ifelse broadcasting" begin - -@testset "where same-shape" begin - function where_same_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - mask = ta .> tb - result = ct.where(mask, ta, tb) - ct.store(c, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - ct.launch(where_same_kernel, cld(n, 16), a, b, c) - - @test Array(c) ≈ ifelse.(Array(a) .> Array(b), Array(a), Array(b)) rtol=1e-5 -end - -@testset "where with scalar y" begin - function where_scalar_y_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - mask = ta .> 0.5f0 - result = ct.where(mask, ta, 0.0f0) - ct.store(b, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(where_scalar_y_kernel, cld(n, 16), a, b) - - @test Array(b) ≈ ifelse.(Array(a) .> 0.5f0, Array(a), 0.0f0) rtol=1e-5 -end - -@testset "where with scalar x" begin - function where_scalar_x_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - mask = ta .> 0.5f0 - result = ct.where(mask, 1.0f0, ta) - ct.store(b, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(where_scalar_x_kernel, cld(n, 16), a, b) - - @test Array(b) ≈ ifelse.(Array(a) .> 0.5f0, 1.0f0, Array(a)) rtol=1e-5 -end - -@testset "where with broadcasting" begin - function where_broadcast_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}) - mask = ct.load(a, (1, 1), (1, 128)) # (1, 128) mask - tile = ct.load(a, (1, 1), (64, 128)) # (64, 128) tile - result = ct.where(mask .> 0.5f0, tile, 0.0f0) - ct.store(b, (1, 1), result) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, m, n) - - ct.launch(where_broadcast_kernel, 1, a, b) - - a_cpu = Array(a) - mask_cpu = a_cpu[1:1, :] .> 0.5f0 - expected = ifelse.(mask_cpu, a_cpu, 0.0f0) - @test Array(b) ≈ expected rtol=1e-5 -end - -@testset "ifelse. same-shape" begin - function ifelse_same_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - result = ifelse.(ta .> tb, ta, tb) - ct.store(c, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - ct.launch(ifelse_same_kernel, cld(n, 16), a, b, c) - - @test Array(c) ≈ ifelse.(Array(a) .> Array(b), Array(a), Array(b)) rtol=1e-5 -end - -@testset "ifelse. with scalar y" begin - function ifelse_scalar_y_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - result = ifelse.(ta .> 0.5f0, ta, 0.0f0) - ct.store(b, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(ifelse_scalar_y_kernel, cld(n, 16), a, b) - - @test Array(b) ≈ ifelse.(Array(a) .> 0.5f0, Array(a), 0.0f0) rtol=1e-5 -end - -@testset "ifelse. 
with both scalars" begin - function ifelse_both_scalar_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - result = ifelse.(ta .> 0.5f0, 1.0f0, 0.0f0) - ct.store(b, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(ifelse_both_scalar_kernel, cld(n, 16), a, b) - - @test Array(b) ≈ ifelse.(Array(a) .> 0.5f0, 1.0f0, 0.0f0) rtol=1e-5 -end - -@testset "ifelse. with broadcasting shapes" begin - function ifelse_broadcast_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}) - col_mask = ct.load(a, (1, 1), (64, 1)) # (64, 1) column - tile = ct.load(a, (1, 1), (64, 128)) # (64, 128) tile - result = ifelse.(col_mask .> 0.5f0, tile, 0.0f0) - ct.store(b, (1, 1), result) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.zeros(Float32, m, n) - - ct.launch(ifelse_broadcast_kernel, 1, a, b) - - a_cpu = Array(a) - mask_cpu = a_cpu[:, 1:1] .> 0.5f0 - expected = ifelse.(mask_cpu, a_cpu, 0.0f0) - @test Array(b) ≈ expected rtol=1e-5 -end - -end # where / ifelse broadcasting - -@testset "max / min broadcasting" begin - -@testset "max. float tile-tile" begin - function max_float_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - result = max.(ta, tb) - ct.store(c, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - ct.launch(max_float_kernel, cld(n, 16), a, b, c) - - @test Array(c) ≈ max.(Array(a), Array(b)) rtol=1e-5 -end - -@testset "min. float tile-tile" begin - function min_float_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - result = min.(ta, tb) - ct.store(c, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - ct.launch(min_float_kernel, cld(n, 16), a, b, c) - - @test Array(c) ≈ min.(Array(a), Array(b)) rtol=1e-5 -end - -@testset "max. float tile-scalar (ReLU)" begin - function relu_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - result = max.(ta, 0.0f0) - ct.store(b, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) .- 0.5f0 # Mix of positive and negative - b = CUDA.zeros(Float32, n) - - ct.launch(relu_kernel, cld(n, 16), a, b) - - @test Array(b) ≈ max.(Array(a), 0.0f0) rtol=1e-5 -end - -@testset "min. float tile-scalar" begin - function clamp_max_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - result = min.(ta, 1.0f0) - ct.store(b, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) .* 2.0f0 # Values in [0, 2] - b = CUDA.zeros(Float32, n) - - ct.launch(clamp_max_kernel, cld(n, 16), a, b) - - @test Array(b) ≈ min.(Array(a), 1.0f0) rtol=1e-5 -end - -@testset "max. 
integer tile-tile (signed)" begin - function max_int_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1}, - c::ct.TileArray{Int32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - result = max.(ta, tb) - ct.store(c, pid, result) - return - end - - n = 1024 - a = CuArray(rand(Int32(-100):Int32(100), n)) - b = CuArray(rand(Int32(-100):Int32(100), n)) - c = CUDA.zeros(Int32, n) - - ct.launch(max_int_kernel, cld(n, 16), a, b, c) - - @test Array(c) == max.(Array(a), Array(b)) -end - -@testset "min. integer tile-tile (signed)" begin - function min_int_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1}, - c::ct.TileArray{Int32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - result = min.(ta, tb) - ct.store(c, pid, result) - return - end - - n = 1024 - a = CuArray(rand(Int32(-100):Int32(100), n)) - b = CuArray(rand(Int32(-100):Int32(100), n)) - c = CUDA.zeros(Int32, n) - - ct.launch(min_int_kernel, cld(n, 16), a, b, c) - - @test Array(c) == min.(Array(a), Array(b)) -end - -@testset "max. broadcasting: (64,1) vs (1,128)" begin - function max_broadcast_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, - c::ct.TileArray{Float32,2}) - col_tile = ct.load(a, (1, 1), (64, 1)) - row_tile = ct.load(b, (1, 1), (1, 128)) - result = max.(col_tile, row_tile) - ct.store(c, (1, 1), result) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, 1) - b = CUDA.rand(Float32, 1, n) - c = CUDA.zeros(Float32, m, n) - - ct.launch(max_broadcast_kernel, 1, a, b, c) - - @test Array(c) ≈ max.(Array(a), Array(b)) rtol=1e-5 -end - -end # max / min broadcasting - -@testset "fma broadcasting" begin - -@testset "fma. same-shape" begin - function fma_same_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}, d::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - tc = ct.load(c, pid, (16,)) - result = fma.(ta, tb, tc) - ct.store(d, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.rand(Float32, n) - d = CUDA.zeros(Float32, n) - - ct.launch(fma_same_kernel, cld(n, 16), a, b, c, d) - - @test Array(d) ≈ fma.(Array(a), Array(b), Array(c)) rtol=1e-5 -end - -@testset "fma. with scalar c" begin - function fma_scalar_c_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - result = fma.(ta, tb, 1.0f0) - ct.store(c, pid, result) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - ct.launch(fma_scalar_c_kernel, cld(n, 16), a, b, c) - - @test Array(c) ≈ fma.(Array(a), Array(b), 1.0f0) rtol=1e-5 -end - -@testset "fma. 
with broadcasting bias" begin - function fma_broadcast_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, - bias::ct.TileArray{Float32,2}, c::ct.TileArray{Float32,2}) - ta = ct.load(a, (1, 1), (64, 128)) - tb = ct.load(b, (1, 1), (64, 128)) - tbias = ct.load(bias, (1, 1), (1, 128)) # (1, 128) bias row - result = fma.(ta, tb, tbias) - ct.store(c, (1, 1), result) - return - end - - m, n = 64, 128 - a = CUDA.rand(Float32, m, n) - b = CUDA.rand(Float32, m, n) - bias = CUDA.rand(Float32, 1, n) - c = CUDA.zeros(Float32, m, n) - - ct.launch(fma_broadcast_kernel, 1, a, b, bias, c) - - @test Array(c) ≈ fma.(Array(a), Array(b), Array(bias)) rtol=1e-5 -end - -end # fma broadcasting - -@testset "multi-arg map" begin - @testset "binary map(+, ...)" begin - function map_add_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - ct.store(c, pid, map(+, ta, tb)) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - ct.launch(map_add_kernel, cld(n, 16), a, b, c) - @test Array(c) ≈ Array(a) + Array(b) - end - - @testset "ternary map(fma, ...)" begin - function map_fma_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}, d::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - tc = ct.load(c, pid, (16,)) - ct.store(d, pid, map(fma, ta, tb, tc)) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.rand(Float32, n) - d = CUDA.zeros(Float32, n) - ct.launch(map_fma_kernel, cld(n, 16), a, b, c, d) - @test Array(d) ≈ fma.(Array(a), Array(b), Array(c)) - end - - @testset "nested broadcast a .+ b .* c" begin - function nested_bc_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}, d::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - tc = ct.load(c, pid, (16,)) - ct.store(d, pid, ta .+ tb .* tc) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.rand(Float32, n) - d = CUDA.zeros(Float32, n) - ct.launch(nested_bc_kernel, cld(n, 16), a, b, c, d) - @test Array(d) ≈ Array(a) .+ Array(b) .* Array(c) - end - - @testset "ifelse broadcast" begin - function ifelse_bc_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, pid, (16,)) - tb = ct.load(b, pid, (16,)) - mask = ta .> tb - ct.store(c, pid, ifelse.(mask, ta, tb)) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - ct.launch(ifelse_bc_kernel, cld(n, 16), a, b, c) - @test Array(c) ≈ max.(Array(a), Array(b)) - end -end - -@testset "invalidations" begin - -@testset "redefine kernel" begin - mod = @eval module $(gensym()) - import cuTile as ct - function vadd_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, (pid,), (16,)) - tb = ct.load(b, (pid,), (16,)) - ct.store(c, (pid,), ta + tb) - return - end - end - - a = CUDA.ones(Float32, 1024) - b = CUDA.ones(Float32, 1024) - c = CUDA.zeros(Float32, 1024) - - ct.launch(mod.vadd_kernel, 64, a, b, c) - @test Array(c) ≈ Array(a) + Array(b) - - @eval mod begin - function vadd_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1}) - 
pid = ct.bid(1) - ta = ct.load(a, (pid,), (16,)) - tb = ct.load(b, (pid,), (16,)) - ct.store(c, (pid,), ta + tb * 2) - return - end - end - - ct.launch(mod.vadd_kernel, 64, a, b, c) - @test Array(c) ≈ Array(a) + Array(b) * 2 -end - -@testset "redefine called function" begin - mod = @eval module $(gensym()) - import cuTile as ct - combine(a, b) = a + b - function vadd_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - ta = ct.load(a, (pid,), (16,)) - tb = ct.load(b, (pid,), (16,)) - ct.store(c, (pid,), combine(ta, tb)) - return - end - end - - a = CUDA.ones(Float32, 1024) - b = CUDA.ones(Float32, 1024) - c = CUDA.zeros(Float32, 1024) - - ct.launch(mod.vadd_kernel, 64, a, b, c) - @test Array(c) ≈ Array(a) + Array(b) - - @eval mod combine(a, b) = a + b * 2 - - ct.launch(mod.vadd_kernel, 64, a, b, c) - @test Array(c) ≈ Array(a) + Array(b) * 2 -end - -@testset "redefine reduce subprogram" begin - mod = @eval module $(gensym()) - import cuTile as ct - combine(a, b) = a + b - function reduce_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, (pid, 1), (1, 128)) - sums = reduce(combine, tile; dims=2, init=0.0f0) - ct.store(b, pid, sums) - return - end - end - - m, n = 64, 128 - a = CUDA.ones(Float32, m, n) - b = CUDA.zeros(Float32, m) - - ct.launch(mod.reduce_kernel, m, a, b) - @test all(Array(b) .≈ Float32(n)) - - # Redefine to max (associative+commutative, tree-order independent) - @eval mod combine(a, b) = max(a, b) - - ct.launch(mod.reduce_kernel, m, a, b) - @test all(Array(b) .≈ 1.0f0) -end - -@testset "redefine scan subprogram" begin - mod = @eval module $(gensym()) - import cuTile as ct - combine(a, b) = a + b - function scan_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile = ct.load(a, pid, (128,)) - scanned = accumulate(combine, tile; dims=1, init=0.0f0) - ct.store(b, pid, scanned) - return - end - end - - n = 128 - a = CUDA.ones(Float32, n) - b = CUDA.zeros(Float32, n) - - ct.launch(mod.scan_kernel, 1, a, b) - expected = Float32.(cumsum(ones(Float32, n))) - @test Array(b) ≈ expected - - # Redefine to max (associative+commutative, tree-order independent) - @eval mod combine(a, b) = max(a, b) - - ct.launch(mod.scan_kernel, 1, a, b) - # Running max over [1,1,...,1] with init=0 gives [1,1,...,1] - @test all(Array(b) .≈ 1.0f0) -end - -end # invalidations - -@testset "reflection macros" begin - function reflect_vadd(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, - c::ct.TileArray{Float32,1}) - pid = ct.bid(1) - tile_a = ct.load(a, pid, (16,)) - tile_b = ct.load(b, pid, (16,)) - ct.store(c, pid, tile_a + tile_b) - return - end - - n = 1024 - a = CUDA.rand(Float32, n) - b = CUDA.rand(Float32, n) - c = CUDA.zeros(Float32, n) - - # @device_code_tiled: check Tile IR output and verify execution - @test @filecheck begin - @check "entry @reflect_vadd" - @check "tile>" - @check "get_tile_block_id" - @check "load_view" - @check "addf" - @check "store_view" - buf = IOBuffer() - ct.@device_code_tiled io=buf ct.launch(reflect_vadd, cld(n, 16), a, b, c) - String(take!(buf)) - end - @test Array(c) ≈ Array(a) + Array(b) - - # @device_code_structured: check StructuredIRCode output - @test @filecheck begin - @check "StructuredIRCode" - @check "get_tile_block_id" - @check "load_partition_view" - @check "addf" - @check "store_partition_view" - buf = IOBuffer() - ct.@device_code_structured io=buf ct.launch(reflect_vadd, cld(n, 16), a, b, c) - 
String(take!(buf)) - end - - # @device_code_typed: check typed Julia IR output - @test @filecheck begin - @check "// reflect_vadd" - @check "get_tile_block_id" - @check "load_partition_view" - @check "addf" - @check "store_partition_view" - buf = IOBuffer() - ct.@device_code_typed io=buf ct.launch(reflect_vadd, cld(n, 16), a, b, c) - String(take!(buf)) - end -end - -@testset "assert" begin - @testset "passing assertion with message" begin - function assert_msg_kernel(a::ct.TileArray{Float32,1}, tile_size::ct.Constant{Int}) - bid = ct.bid(1) - ct.@assert bid > Int32(0) "bid must be positive" - t = ct.load(a, bid, (tile_size[],)) - ct.store(a, bid, t) - return - end - - a = CUDA.ones(Float32, 1024) - ct.launch(assert_msg_kernel, cld(1024, 128), a, ct.Constant(128)) - CUDA.synchronize() - @test all(Array(a) .== 1.0f0) - end - - @testset "passing assertion without message" begin - function assert_nomsg_kernel(a::ct.TileArray{Float32,1}, tile_size::ct.Constant{Int}) - bid = ct.bid(1) - ct.@assert bid > Int32(0) - t = ct.load(a, bid, (tile_size[],)) - ct.store(a, bid, t) - return - end - - a = CUDA.ones(Float32, 1024) - ct.launch(assert_nomsg_kernel, cld(1024, 128), a, ct.Constant(128)) - CUDA.synchronize() - @test all(Array(a) .== 1.0f0) - end - - @testset "failing assertion" begin - # Failed assertions crash the CUDA context, so we must test in a subprocess - # (following the same pattern as cuTile Python's test_assert.py) - script = """ - using CUDA - import cuTile as ct - - function assert_fail_kernel(a::ct.TileArray{Float32,1}, tile_size::ct.Constant{Int}) - bid = ct.bid(1) - ct.@assert bid > Int32(999999) "custom assert message" - t = ct.load(a, bid, (tile_size[],)) - ct.store(a, bid, t) - return - end - - a = CUDA.ones(Float32, 1024) - ct.launch(assert_fail_kernel, cld(1024, 128), a, ct.Constant(128)) - CUDA.synchronize() - """ - cmd = `$(Base.julia_cmd()) --project=$(Base.active_project()) -e $script` - output = Pipe() - proc = run(pipeline(ignorestatus(cmd); stdout=output, stderr=output); wait=false) - close(output.in) - reader = @async read(output, String) - wait(proc) - result = fetch(reader) - @test proc.exitcode != 0 - @test contains(result, "custom assert message") - end -end - diff --git a/test/execution/advanced.jl b/test/execution/advanced.jl new file mode 100644 index 0000000..6207a1d --- /dev/null +++ b/test/execution/advanced.jl @@ -0,0 +1,236 @@ +using CUDA + +@testset "invalidations" begin + +@testset "redefine kernel" begin + mod = @eval module $(gensym()) + import cuTile as ct + function vadd_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, (pid,), (16,)) + tb = ct.load(b, (pid,), (16,)) + ct.store(c, (pid,), ta + tb) + return + end + end + + a = CUDA.ones(Float32, 1024) + b = CUDA.ones(Float32, 1024) + c = CUDA.zeros(Float32, 1024) + + ct.launch(mod.vadd_kernel, 64, a, b, c) + @test Array(c) ≈ Array(a) + Array(b) + + @eval mod begin + function vadd_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, (pid,), (16,)) + tb = ct.load(b, (pid,), (16,)) + ct.store(c, (pid,), ta + tb * 2) + return + end + end + + ct.launch(mod.vadd_kernel, 64, a, b, c) + @test Array(c) ≈ Array(a) + Array(b) * 2 +end + +@testset "redefine called function" begin + mod = @eval module $(gensym()) + import cuTile as ct + combine(a, b) = a + b + function vadd_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1}) + 
pid = ct.bid(1) + ta = ct.load(a, (pid,), (16,)) + tb = ct.load(b, (pid,), (16,)) + ct.store(c, (pid,), combine(ta, tb)) + return + end + end + + a = CUDA.ones(Float32, 1024) + b = CUDA.ones(Float32, 1024) + c = CUDA.zeros(Float32, 1024) + + ct.launch(mod.vadd_kernel, 64, a, b, c) + @test Array(c) ≈ Array(a) + Array(b) + + @eval mod combine(a, b) = a + b * 2 + + ct.launch(mod.vadd_kernel, 64, a, b, c) + @test Array(c) ≈ Array(a) + Array(b) * 2 +end + +@testset "redefine reduce subprogram" begin + mod = @eval module $(gensym()) + import cuTile as ct + combine(a, b) = a + b + function reduce_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 128)) + sums = reduce(combine, tile; dims=2, init=0.0f0) + ct.store(b, pid, sums) + return + end + end + + m, n = 64, 128 + a = CUDA.ones(Float32, m, n) + b = CUDA.zeros(Float32, m) + + ct.launch(mod.reduce_kernel, m, a, b) + @test all(Array(b) .≈ Float32(n)) + + # Redefine to max (associative+commutative, tree-order independent) + @eval mod combine(a, b) = max(a, b) + + ct.launch(mod.reduce_kernel, m, a, b) + @test all(Array(b) .≈ 1.0f0) +end + +@testset "redefine scan subprogram" begin + mod = @eval module $(gensym()) + import cuTile as ct + combine(a, b) = a + b + function scan_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, pid, (128,)) + scanned = accumulate(combine, tile; dims=1, init=0.0f0) + ct.store(b, pid, scanned) + return + end + end + + n = 128 + a = CUDA.ones(Float32, n) + b = CUDA.zeros(Float32, n) + + ct.launch(mod.scan_kernel, 1, a, b) + expected = Float32.(cumsum(ones(Float32, n))) + @test Array(b) ≈ expected + + # Redefine to max (associative+commutative, tree-order independent) + @eval mod combine(a, b) = max(a, b) + + ct.launch(mod.scan_kernel, 1, a, b) + # Running max over [1,1,...,1] with init=0 gives [1,1,...,1] + @test all(Array(b) .≈ 1.0f0) +end + +end # invalidations + +@testset "reflection macros" begin + function reflect_vadd(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile_a = ct.load(a, pid, (16,)) + tile_b = ct.load(b, pid, (16,)) + ct.store(c, pid, tile_a + tile_b) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + # @device_code_tiled: check Tile IR output and verify execution + @test @filecheck begin + @check "entry @reflect_vadd" + @check "tile>" + @check "get_tile_block_id" + @check "load_view" + @check "addf" + @check "store_view" + buf = IOBuffer() + ct.@device_code_tiled io=buf ct.launch(reflect_vadd, cld(n, 16), a, b, c) + String(take!(buf)) + end + @test Array(c) ≈ Array(a) + Array(b) + + # @device_code_structured: check StructuredIRCode output + @test @filecheck begin + @check "StructuredIRCode" + @check "get_tile_block_id" + @check "load_partition_view" + @check "addf" + @check "store_partition_view" + buf = IOBuffer() + ct.@device_code_structured io=buf ct.launch(reflect_vadd, cld(n, 16), a, b, c) + String(take!(buf)) + end + + # @device_code_typed: check typed Julia IR output + @test @filecheck begin + @check "// reflect_vadd" + @check "get_tile_block_id" + @check "load_partition_view" + @check "addf" + @check "store_partition_view" + buf = IOBuffer() + ct.@device_code_typed io=buf ct.launch(reflect_vadd, cld(n, 16), a, b, c) + String(take!(buf)) + end +end + +@testset "assert" begin + @testset "passing assertion with message" begin + function 
assert_msg_kernel(a::ct.TileArray{Float32,1}, tile_size::ct.Constant{Int}) + bid = ct.bid(1) + ct.@assert bid > Int32(0) "bid must be positive" + t = ct.load(a, bid, (tile_size[],)) + ct.store(a, bid, t) + return + end + + a = CUDA.ones(Float32, 1024) + ct.launch(assert_msg_kernel, cld(1024, 128), a, ct.Constant(128)) + CUDA.synchronize() + @test all(Array(a) .== 1.0f0) + end + + @testset "passing assertion without message" begin + function assert_nomsg_kernel(a::ct.TileArray{Float32,1}, tile_size::ct.Constant{Int}) + bid = ct.bid(1) + ct.@assert bid > Int32(0) + t = ct.load(a, bid, (tile_size[],)) + ct.store(a, bid, t) + return + end + + a = CUDA.ones(Float32, 1024) + ct.launch(assert_nomsg_kernel, cld(1024, 128), a, ct.Constant(128)) + CUDA.synchronize() + @test all(Array(a) .== 1.0f0) + end + + @testset "failing assertion" begin + # Failed assertions crash the CUDA context, so we must test in a subprocess + # (following the same pattern as cuTile Python's test_assert.py) + script = """ + using CUDA + import cuTile as ct + + function assert_fail_kernel(a::ct.TileArray{Float32,1}, tile_size::ct.Constant{Int}) + bid = ct.bid(1) + ct.@assert bid > Int32(999999) "custom assert message" + t = ct.load(a, bid, (tile_size[],)) + ct.store(a, bid, t) + return + end + + a = CUDA.ones(Float32, 1024) + ct.launch(assert_fail_kernel, cld(1024, 128), a, ct.Constant(128)) + CUDA.synchronize() + """ + cmd = `$(Base.julia_cmd()) --project=$(Base.active_project()) -e $script` + output = Pipe() + proc = run(pipeline(ignorestatus(cmd); stdout=output, stderr=output); wait=false) + close(output.in) + reader = @async read(output, String) + wait(proc) + result = fetch(reader) + @test proc.exitcode != 0 + @test contains(result, "custom assert message") + end +end diff --git a/test/execution/atomics.jl b/test/execution/atomics.jl new file mode 100644 index 0000000..ffec3b4 --- /dev/null +++ b/test/execution/atomics.jl @@ -0,0 +1,213 @@ +using CUDA + +@testset "atomic operations" begin + +@testset "atomic_add Int" begin + # Test atomic_add with Int: each thread block adds 1 to a counter + function atomic_add_kernel(counters::ct.TileArray{Int,1}) + bid = ct.bid(1) + ct.atomic_add(counters, 1, 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + n_blocks = 1000 + counters = CUDA.zeros(Int, 1) + + ct.launch(atomic_add_kernel, n_blocks, counters) + + result = Array(counters)[1] + @test result == n_blocks +end + +@testset "atomic_add Float32" begin + # Test atomic_add with Float32 + function atomic_add_f32_kernel(out::ct.TileArray{Float32,1}, val::ct.Constant{Float32}) + bid = ct.bid(1) + ct.atomic_add(out, 1, val[]; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + n_blocks = 100 + out = CUDA.zeros(Float32, 1) + val = 1.5f0 + + ct.launch(atomic_add_f32_kernel, n_blocks, out, ct.Constant(val)) + + result = Array(out)[1] + @test result ≈ n_blocks * val rtol=1e-3 +end + +@testset "atomic_xchg" begin + # Test atomic_xchg: each thread exchanges, last one wins + function atomic_xchg_kernel(arr::ct.TileArray{Int,1}) + bid = ct.bid(1) + ct.atomic_xchg(arr, 1, bid + 1; + memory_order=ct.MemoryOrder.AcqRel) + return + end + + n_blocks = 10 + arr = CUDA.zeros(Int, 1) + + ct.launch(atomic_xchg_kernel, n_blocks, arr) + + # Result should be one of 1..n_blocks (whichever thread ran last) + result = Array(arr)[1] + @test 1 <= result <= n_blocks +end + +@testset "atomic_cas success" begin + # Test atomic_cas: only one thread should succeed in setting 0->1 + function atomic_cas_kernel(locks::ct.TileArray{Int,1}, 
success_count::ct.TileArray{Int,1}) + bid = ct.bid(1) + # Try to acquire lock (0 -> 1) + old = ct.atomic_cas(locks, 1, 0, 1; + memory_order=ct.MemoryOrder.AcqRel) + # old == 0 means this block acquired the lock. Tile-level conditionals are not + # available here, so instead of bumping success_count on success, the host side + # below simply checks that the lock value changed. + return + end + + locks = CUDA.zeros(Int, 1) + success_count = CUDA.zeros(Int, 1) + + ct.launch(atomic_cas_kernel, 100, locks, success_count) + + # Lock should be set to 1 (at least one block succeeded) + lock_val = Array(locks)[1] + @test lock_val == 1 +end + +@testset "spinlock with token ordering" begin + # Test that token threading enforces memory ordering in spinlock patterns + function spinlock_kernel(result::ct.TileArray{Float32,1}, lock::ct.TileArray{Int,1}) + bid = ct.bid(1) + val = ct.full((1,), 1.0f0, Float32) + + # Spin until we acquire the lock (CAS returns old value, 0 means we got it) + while ct.atomic_cas(lock, 1, 0, 1; + memory_order=ct.MemoryOrder.Acquire) == 1 + end + + # Critical section: load, increment, store + # With proper token threading, these are ordered after the acquire + current = ct.load(result, 1, (1,)) + updated = current .+ val + ct.store(result, 1, updated) + + # Release the lock + ct.atomic_xchg(lock, 1, 0; + memory_order=ct.MemoryOrder.Release) + return + end + + n_blocks = 50 # Use fewer blocks to reduce test time + result = CUDA.zeros(Float32, 1) + lock = CUDA.zeros(Int, 1) + + ct.launch(spinlock_kernel, n_blocks, result, lock) + + # Each block should have added 1.0 to the result + final_result = Array(result)[1] + @test final_result == Float32(n_blocks) +end + +@testset "explicit memory ordering kwargs" begin + # Test that explicit memory_order kwargs work correctly + function explicit_ordering_kernel(result::ct.TileArray{Float32,1}, lock::ct.TileArray{Int,1}) + bid = ct.bid(1) + val = ct.full((1,), 1.0f0, Float32) + + # Spin until we acquire the lock - use explicit Acquire ordering + while ct.atomic_cas(lock, 1, 0, 1; + memory_order=ct.MemoryOrder.Acquire) == 1 + end + + # Critical section + current = ct.load(result, 1, (1,)) + updated = current .+ val + ct.store(result, 1, updated) + + # Release the lock - use explicit Release ordering + ct.atomic_xchg(lock, 1, 0; memory_order=ct.MemoryOrder.Release) + return + end + + n_blocks = 50 + result = CUDA.zeros(Float32, 1) + lock = CUDA.zeros(Int, 1) + + ct.launch(explicit_ordering_kernel, n_blocks, result, lock) + + final_result = Array(result)[1] + @test final_result == Float32(n_blocks) +end + +@testset "atomic_add with explicit kwargs" begin + # Test atomic_add with explicit memory ordering + function explicit_add_kernel(counters::ct.TileArray{Int,1}) + bid = ct.bid(1) + ct.atomic_add(counters, 1, 1; + memory_order=ct.MemoryOrder.Relaxed, + memory_scope=ct.MemScope.Device) + return + end + + n_blocks = 100 + counters = CUDA.zeros(Int, 1) + + ct.launch(explicit_add_kernel, n_blocks, counters) + + result = Array(counters)[1] + @test result == n_blocks +end + +@testset "1D gather - simple" begin + # Simple 1D gather: copy first 16 elements using gather + function gather_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1}) + pid = ct.bid(1) + # Simple indices 0..15 + indices = ct.arange((16,), Int) + # Gather from source + tile = ct.gather(src, indices) + # Store to destination + ct.store(dst, pid, tile) + return + end + + n = 16 + src = CUDA.rand(Float32, n) + dst = 
CUDA.zeros(Float32, n) + + ct.launch(gather_simple_kernel, 1, src, dst) + + @test Array(dst) ≈ Array(src) +end + +@testset "1D scatter - simple" begin + # Simple 1D scatter: write first 16 elements using scatter + function scatter_simple_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1}) + pid = ct.bid(1) + # Load from source + tile = ct.load(src, pid, (16,)) + # Simple indices 0..15 + indices = ct.arange((16,), Int) + # Scatter to destination + ct.scatter(dst, indices, tile) + return + end + + n = 16 + src = CUDA.rand(Float32, n) + dst = CUDA.zeros(Float32, n) + + ct.launch(scatter_simple_kernel, 1, src, dst) + + @test Array(dst) ≈ Array(src) +end + +end diff --git a/test/execution/basic.jl b/test/execution/basic.jl new file mode 100644 index 0000000..750f5d3 --- /dev/null +++ b/test/execution/basic.jl @@ -0,0 +1,1060 @@ +using CUDA + +@testset "launch" begin + +@testset "1D vector add" begin + function vadd_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile_a = ct.load(a, pid, (16,)) + tile_b = ct.load(b, pid, (16,)) + ct.store(c, pid, tile_a + tile_b) + return + end + + n = 1024 + tile_size = 16 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_1d, cld(n, tile_size), a, b, c) + + @test Array(c) ≈ Array(a) + Array(b) +end + +@testset "1D vector sub" begin + function vsub_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile_a = ct.load(a, pid, (16,)) + tile_b = ct.load(b, pid, (16,)) + ct.store(c, pid, tile_a - tile_b) + return + end + + n = 1024 + tile_size = 16 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + ct.launch(vsub_1d, cld(n, tile_size), a, b, c) + + @test Array(c) ≈ Array(a) - Array(b) +end + +@testset "1D vector mul" begin + function vmul_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile_a = ct.load(a, pid, (16,)) + tile_b = ct.load(b, pid, (16,)) + ct.store(c, pid, tile_a .* tile_b) + return + end + + n = 1024 + tile_size = 16 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + ct.launch(vmul_1d, cld(n, tile_size), a, b, c) + + @test Array(c) ≈ Array(a) .* Array(b) +end + +@testset "2D matrix add" begin + function madd_2d(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, + c::ct.TileArray{Float32,2}) + bidx = ct.bid(1) + bidy = ct.bid(2) + tile_a = ct.load(a, (bidx, bidy), (32, 32)) + tile_b = ct.load(b, (bidx, bidy), (32, 32)) + ct.store(c, (bidx, bidy), tile_a + tile_b) + return + end + + m, n = 256, 256 + tile_x, tile_y = 32, 32 + a = CUDA.rand(Float32, m, n) + b = CUDA.rand(Float32, m, n) + c = CUDA.zeros(Float32, m, n) + + ct.launch(madd_2d, (cld(m, tile_x), cld(n, tile_y)), a, b, c) + + @test Array(c) ≈ Array(a) + Array(b) +end + +@testset "4D tensor add" begin + # 4D loads require TileArray with explicit sizes (grid only provides 3D) + function tadd_4d(a::ct.TileArray{Float32,4}, b::ct.TileArray{Float32,4}, + c::ct.TileArray{Float32,4}) + bidx = ct.bid(1) + bidy = ct.bid(2) + bidz = ct.bid(3) + # Load 4D tiles - 4th dimension index is fixed at 1 + tile_a = ct.load(a, (bidx, bidy, bidz, 1), (4, 4, 4, 2)) + tile_b = ct.load(b, (bidx, bidy, bidz, 1), (4, 4, 4, 2)) + ct.store(c, (bidx, bidy, bidz, 1), tile_a + tile_b) + return + end + + # Array shape: (d1, d2, d3, d4) with tile shape (4, 4, 4, 2) + d1, d2, d3, d4 = 16, 16, 8, 2 + tile_1, 
tile_2, tile_3, tile_4 = 4, 4, 4, 2 + a = CUDA.rand(Float32, d1, d2, d3, d4) + b = CUDA.rand(Float32, d1, d2, d3, d4) + c = CUDA.zeros(Float32, d1, d2, d3, d4) + + grid = (cld(d1, tile_1), cld(d2, tile_2), cld(d3, tile_3)) + ct.launch(tadd_4d, grid, a, b, c) + + @test Array(c) ≈ Array(a) + Array(b) +end + +@testset "rank mismatch load/store" begin + @testset "1D shape on 2D array" begin + function copy_1d_2d(src::ct.TileArray{Float32,2}, dst::ct.TileArray{Float32,2}) + bid = ct.bid(1) + tile = ct.load(src, (bid, 1), (16,)) + ct.store(dst, (bid, 1), tile) + return + end + + m = 64 + src = CUDA.rand(Float32, m, 1) + dst = CUDA.zeros(Float32, m, 1) + + ct.launch(copy_1d_2d, cld(m, 16), src, dst) + + @test Array(dst) ≈ Array(src) + end + + @testset "2D shape on 4D array" begin + function copy_2d_4d(src::ct.TileArray{Float32,4}, dst::ct.TileArray{Float32,4}) + bidx = ct.bid(1) + bidy = ct.bid(2) + tile = ct.load(src, (bidx, bidy, 1, 1), (4, 4)) + ct.store(dst, (bidx, bidy, 1, 1), tile) + return + end + + d1, d2 = 16, 16 + src = CUDA.rand(Float32, d1, d2, 1, 1) + dst = CUDA.zeros(Float32, d1, d2, 1, 1) + + ct.launch(copy_2d_4d, (cld(d1, 4), cld(d2, 4)), src, dst) + + @test Array(dst) ≈ Array(src) + end +end + +@testset "transpose" begin + function transpose_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) + bidx = ct.bid(1) + bidy = ct.bid(2) + tile = ct.load(x, (bidx, bidy), (32, 32)) + transposed = transpose(tile) + ct.store(y, (bidy, bidx), transposed) + return + end + + m, n = 256, 128 + tile_size = 32 + x = CUDA.rand(Float32, m, n) + y = CUDA.zeros(Float32, n, m) + + ct.launch(transpose_kernel, (cld(m, tile_size), cld(n, tile_size)), x, y) + + @test Array(y) ≈ transpose(Array(x)) +end + +@testset "reshape" begin + @testset "2D -> 1D reshape preserves elements" begin + function reshape_2d_to_1d_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,1}) + bid = ct.bid(1) + # Load a 4x8 tile + tile = ct.load(x, (bid, 1), (4, 8)) + # Reshape to 32 elements (flat) + reshaped = reshape(tile, (32,)) + ct.store(y, bid, reshaped) + return + end + + m, n = 64, 8 + x = CUDA.rand(Float32, m, n) + # Each of the m/4 = 16 blocks produces 32 elements + y = CUDA.zeros(Float32, m * n) + + ct.launch(reshape_2d_to_1d_kernel, cld(m, 4), x, y) + + # Verify all elements are preserved (same multiset) + x_cpu = Array(x) + y_cpu = Array(y) + for bid in 0:(cld(m, 4)-1) + row_start = bid * 4 + 1 + row_end = row_start + 3 + input_elements = sort(vec(x_cpu[row_start:row_end, 1:8])) + output_elements = sort(y_cpu[(bid*32+1):((bid+1)*32)]) + @test output_elements ≈ input_elements + end + end + + @testset "1D -> 2D reshape preserves elements" begin + function reshape_1d_to_2d_kernel(x::ct.TileArray{Float32,1}, y::ct.TileArray{Float32,2}) + bid = ct.bid(1) + # Load 32 elements + tile = ct.load(x, bid, (32,)) + # Reshape to 4x8 + reshaped = reshape(tile, (4, 8)) + ct.store(y, (bid, 1), reshaped) + return + end + + n = 512 + x = CUDA.rand(Float32, n) + m_out = n ÷ 8 + y = CUDA.zeros(Float32, m_out, 8) + + ct.launch(reshape_1d_to_2d_kernel, cld(n, 32), x, y) + + # Verify all elements are preserved (same multiset) + x_cpu = Array(x) + y_cpu = Array(y) + for bid in 0:(cld(n, 32)-1) + start_idx = bid * 32 + 1 + input_elements = sort(x_cpu[start_idx:(start_idx+31)]) + row_start = bid * 4 + 1 + output_elements = sort(vec(y_cpu[row_start:(row_start+3), 1:8])) + @test output_elements ≈ input_elements + end + end + + @testset "reshape roundtrip preserves data" begin + function 
reshape_roundtrip_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) + bid = ct.bid(1) + # Load 8x4 tile + tile = ct.load(x, (bid, 1), (8, 4)) + # Reshape to 32, then back to 8x4 + flat = reshape(tile, (32,)) + back = reshape(flat, (8, 4)) + ct.store(y, (bid, 1), back) + return + end + + m, n = 64, 4 + x = CUDA.rand(Float32, m, n) + y = CUDA.zeros(Float32, m, n) + + ct.launch(reshape_roundtrip_kernel, cld(m, 8), x, y) + + @test Array(y) ≈ Array(x) + end +end + +@testset "reshape column-major semantics" begin + # These tests verify that reshape matches Julia's column-major reshape behavior, + # not just that elements are preserved (which would pass even with wrong ordering). + # Note: tile shapes must be powers of 2. + + @testset "1D → 2D matches Julia reshape exactly" begin + function reshape_1d_to_2d_exact_kernel(x::ct.TileArray{Float32,1}, y::ct.TileArray{Float32,2}, + n::ct.Constant{Int}, shape::ct.Constant{NTuple{2,Int}}) + bid = ct.bid(1) + tile = ct.load(x, bid, (n[],)) + reshaped = reshape(tile, shape[]) + ct.store(y, (bid, 1), reshaped) + return + end + + n = 32 + shape = (4, 8) + # Sequential values to detect any reordering + x = CuArray(Float32.(1:n)) + y = CUDA.zeros(Float32, shape) + + ct.launch(reshape_1d_to_2d_exact_kernel, 1, x, y, ct.Constant(n), ct.Constant(shape)) + + # Must match Julia's column-major reshape exactly (not just same elements) + expected = reshape(Float32.(1:n), shape) + @test Array(y) ≈ expected + end + + @testset "2D → 1D matches Julia vec exactly" begin + function reshape_2d_to_1d_exact_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,1}, + shape::ct.Constant{NTuple{2,Int}}, n::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(x, (bid, 1), shape[]) + reshaped = reshape(tile, (n[],)) + ct.store(y, bid, reshaped) + return + end + + shape = (4, 8) + n = prod(shape) + # Create 2D array with sequential column-major values + x = CuArray(Float32.(reshape(1:n, shape))) + y = CUDA.zeros(Float32, n) + + ct.launch(reshape_2d_to_1d_exact_kernel, 1, x, y, ct.Constant(shape), ct.Constant(n)) + + # Flattening should give column-major order: 1,2,3,4,...,32 + expected = vec(Float32.(reshape(1:n, shape))) + @test Array(y) ≈ expected + end + + @testset "2D → 2D reshape matches Julia reshape exactly" begin + function reshape_2d_to_2d_exact_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}, + src_shape::ct.Constant{NTuple{2,Int}}, + tgt_shape::ct.Constant{NTuple{2,Int}}) + bid = ct.bid(1) + tile = ct.load(x, (bid, 1), src_shape[]) + reshaped = reshape(tile, tgt_shape[]) + ct.store(y, (bid, 1), reshaped) + return + end + + src_shape = (4, 8) + tgt_shape = (8, 4) + n = prod(src_shape) + x = CuArray(Float32.(reshape(1:n, src_shape))) + y = CUDA.zeros(Float32, tgt_shape) + + ct.launch(reshape_2d_to_2d_exact_kernel, 1, x, y, + ct.Constant(src_shape), ct.Constant(tgt_shape)) + + expected = reshape(Float32.(reshape(1:n, src_shape)), tgt_shape) + @test Array(y) ≈ expected + end + + @testset "3D → 2D reshape matches Julia reshape exactly" begin + function reshape_3d_to_2d_exact_kernel(x::ct.TileArray{Float32,3}, y::ct.TileArray{Float32,2}, + src_shape::ct.Constant{NTuple{3,Int}}, + tgt_shape::ct.Constant{NTuple{2,Int}}) + bid = ct.bid(1) + tile = ct.load(x, (bid, 1, 1), src_shape[]) + reshaped = reshape(tile, tgt_shape[]) + ct.store(y, (bid, 1), reshaped) + return + end + + src_shape = (2, 4, 4) + tgt_shape = (8, 4) + n = prod(src_shape) + x = CuArray(Float32.(reshape(1:n, src_shape))) + y = CUDA.zeros(Float32, tgt_shape) + + 
ct.launch(reshape_3d_to_2d_exact_kernel, 1, x, y, + ct.Constant(src_shape), ct.Constant(tgt_shape)) + + expected = reshape(Float32.(reshape(1:n, src_shape)), tgt_shape) + @test Array(y) ≈ expected + end + + @testset "3D reshape round-trip with packing dim D=$D" for D in [2, 4] + # This is the atom_packing pattern: (BS, N, 2) → (BS, N*2/D, D) → (BS, N, 2) + function reshape_roundtrip_3d_kernel(x::ct.TileArray{Float32,3}, y::ct.TileArray{Float32,3}, + orig_shape::ct.Constant{NTuple{3,Int}}, + packed_shape::ct.Constant{NTuple{3,Int}}) + bid = ct.bid(1) + tile = ct.load(x, (bid, 1, 1), orig_shape[]) + packed = reshape(tile, packed_shape[]) + unpacked = reshape(packed, orig_shape[]) + ct.store(y, (bid, 1, 1), unpacked) + return + end + + BS, N = 1, 8 + orig_shape = (BS, N, 2) + packed_shape = (BS, N * 2 ÷ D, D) + + # Sequential values to detect any reordering + x = CuArray(Float32.(reshape(1:prod(orig_shape), orig_shape))) + y = CUDA.zeros(Float32, orig_shape) + + ct.launch(reshape_roundtrip_3d_kernel, 1, x, y, + ct.Constant(orig_shape), ct.Constant(packed_shape)) + + # Round-trip must preserve exact data, not just same elements + @test Array(y) ≈ Array(x) + end + + @testset "2D → 1D → 2D round-trip preserves exact layout" begin + function reshape_2d_1d_2d_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}, + shape::ct.Constant{NTuple{2,Int}}) + bid = ct.bid(1) + tile = ct.load(x, (bid, 1), shape[]) + flat = reshape(tile, (prod(shape[]),)) + back = reshape(flat, shape[]) + ct.store(y, (bid, 1), back) + return + end + + shape = (4, 8) + x = CuArray(Float32.(reshape(1:prod(shape), shape))) + y = CUDA.zeros(Float32, shape) + + ct.launch(reshape_2d_1d_2d_kernel, 1, x, y, ct.Constant(shape)) + + @test Array(y) ≈ Array(x) + end +end + +@testset "permutedims" begin + @testset "2D permutedims (transpose-like)" begin + function permute_2d_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) + bid = ct.bid(1) + # Load 8x4 tile + tile = ct.load(x, (bid, 1), (8, 4)) + # Permute with (2, 1) to swap dimensions: (8, 4) -> (4, 8) + permuted = permutedims(tile, (2, 1)) + ct.store(y, (bid, 1), permuted) + return + end + + m, n = 64, 4 + x = CUDA.rand(Float32, m, n) + y = CUDA.zeros(Float32, cld(m, 8) * 4, 8) + + ct.launch(permute_2d_kernel, cld(m, 8), x, y) + + # Verify permutedims matches transpose + x_cpu = Array(x) + y_cpu = Array(y) + for bid in 0:(cld(m, 8)-1) + row_start = bid * 8 + 1 + input_tile = x_cpu[row_start:(row_start+7), 1:4] + out_row_start = bid * 4 + 1 + output_tile = y_cpu[out_row_start:(out_row_start+3), 1:8] + # Compare sorted values since memory layouts may differ + @test sort(vec(output_tile)) ≈ sort(vec(transpose(input_tile))) + end + end + + @testset "permutedims roundtrip preserves data" begin + function permute_roundtrip_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) + bid = ct.bid(1) + # Load 4x8 tile + tile = ct.load(x, (bid, 1), (4, 8)) + # Permute with (2, 1), then back with (2, 1) + permuted = permutedims(tile, (2, 1)) # (4, 8) -> (8, 4) + back = permutedims(permuted, (2, 1)) # (8, 4) -> (4, 8) + ct.store(y, (bid, 1), back) + return + end + + m, n = 64, 8 + x = CUDA.rand(Float32, m, n) + y = CUDA.zeros(Float32, m, n) + + ct.launch(permute_roundtrip_kernel, cld(m, 4), x, y) + + @test Array(y) ≈ Array(x) + end +end + +@testset "strided" begin + @testset "PermutedDimsArray" begin + function copy_kernel_2d( + src::ct.TileArray{Float32, 2}, dst::ct.TileArray{Float32, 2}, + tile_x::ct.Constant{Int}, tile_y::ct.Constant{Int} + ) + bid_x = ct.bid(1) + 
bid_y = ct.bid(2) + tile = ct.load(src, (bid_x, bid_y), (tile_x[], tile_y[])) + ct.store(dst, (bid_x, bid_y), tile) + return + end + + m, n = 64, 32 + tm, tn = 16, 16 + A = CuArray(Float32.(reshape(1:n*m, n, m))) + P = PermutedDimsArray(A, (2, 1)) + out = CUDA.zeros(Float32, m, n) + + grid = (cld(m, tm), cld(n, tn)) + ct.launch(copy_kernel_2d, grid, P, out, ct.Constant(tm), ct.Constant(tn)) + + @test out == permutedims(A, (2, 1)) + end + + @testset "load with order=(2,1)" begin + function order_load_kernel( + src::ct.TileArray{Float32, 2}, dst::ct.TileArray{Float32, 2}, + t::ct.Constant{Int} + ) + bid_x = ct.bid(1) + bid_y = ct.bid(2) + tile = ct.load(src, (bid_x, bid_y), (t[], t[]); order=(2, 1)) + ct.store(dst, (bid_x, bid_y), tile) + return + end + + n = 64; t = 16 + src = CuArray(Float32.(reshape(1:n*n, n, n))) + dst = CUDA.zeros(Float32, n, n) + + ct.launch(order_load_kernel, (cld(n, t), cld(n, t)), src, dst, ct.Constant(t)) + + @test Array(dst) ≈ transpose(Array(src)) + end + + @testset "store with order=(2,1)" begin + function order_store_kernel( + src::ct.TileArray{Float32, 2}, dst::ct.TileArray{Float32, 2}, + t::ct.Constant{Int} + ) + bid_x = ct.bid(1) + bid_y = ct.bid(2) + tile = ct.load(src, (bid_x, bid_y), (t[], t[])) + ct.store(dst, (bid_x, bid_y), tile; order=(2, 1)) + return + end + + n = 64; t = 16 + src = CuArray(Float32.(reshape(1:n*n, n, n))) + dst = CUDA.zeros(Float32, n, n) + + ct.launch(order_store_kernel, (cld(n, t), cld(n, t)), src, dst, ct.Constant(t)) + + @test Array(dst) ≈ transpose(Array(src)) + end +end + +@testset "extract" begin + @testset "extract identity (0,0) full shape" begin + function extract_identity_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) + bid = ct.bid(1) + # Load 4x8 tile + tile = ct.load(x, (bid, 1), (4, 8)) + # Extract the full tile starting at (0, 0) + extracted = ct.extract(tile, (2, 2), (4, 8)) + ct.store(y, (bid, 1), extracted) + return + end + + m, n = 64, 8 + x = CUDA.rand(Float32, m, n) + y = CUDA.zeros(Float32, m, n) + + ct.launch(extract_identity_kernel, cld(m, 4), x, y) + + # Full extract at (0,0) should preserve data + @test Array(y) ≈ Array(x) + end + + @testset "extract (1,1) smaller shape" begin + function extract_smaller_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) + bid = ct.bid(1) + # Load 8x8 tile + tile = ct.load(x, (bid, 1), (8, 8)) + # Extract 4x4 at (1, 1) - top-left corner + extracted = ct.extract(tile, (1, 1), (4, 4)) + ct.store(y, (bid, 1), extracted) + return + end + + m, n = 64, 8 + x = CUDA.rand(Float32, m, n) + y = CUDA.zeros(Float32, cld(m, 8) * 4, 4) + + ct.launch(extract_smaller_kernel, cld(m, 8), x, y) + + # Verify elements are preserved for top-left 4x4 + x_cpu = Array(x) + y_cpu = Array(y) + for bid in 0:(cld(m, 8)-1) + input_start = bid * 8 + 1 + input_slice = x_cpu[input_start:(input_start+3), 1:4] + output_start = bid * 4 + 1 + output_slice = y_cpu[output_start:(output_start+3), 1:4] + @test sort(vec(output_slice)) ≈ sort(vec(input_slice)) + end + end + + @testset "extract with slice indices" begin + # Extract uses SLICE INDICES, not offsets! 
+ # For shape (8,8) -> (4,4): valid indices are {1,2} x {1,2} + # Index (2, 1) extracts rows 5-8 (the second slice in first dimension) + + function extract_all_quadrants_kernel(x::ct.TileArray{Float32,2}, + y0::ct.TileArray{Float32,2}, + y1::ct.TileArray{Float32,2}, + y2::ct.TileArray{Float32,2}, + y3::ct.TileArray{Float32,2}) + bid = ct.bid(1) + tile = ct.load(x, (bid, 1), (8, 8)) + # Extract all 4 quadrants + q0 = ct.extract(tile, (1, 1), (4, 4)) # Top-left + q1 = ct.extract(tile, (2, 1), (4, 4)) # Bottom-left + q2 = ct.extract(tile, (1, 2), (4, 4)) # Top-right + q3 = ct.extract(tile, (2, 2), (4, 4)) # Bottom-right + ct.store(y0, (bid, 1), q0) + ct.store(y1, (bid, 1), q1) + ct.store(y2, (bid, 1), q2) + ct.store(y3, (bid, 1), q3) + return + end + + # Create input with different values in each quadrant + x = CUDA.zeros(Float32, 8, 8) + x[1:4, 1:4] .= 1.0f0 # TL + x[5:8, 1:4] .= 2.0f0 # BL + x[1:4, 5:8] .= 3.0f0 # TR + x[5:8, 5:8] .= 4.0f0 # BR + + y0 = CUDA.zeros(Float32, 4, 4) + y1 = CUDA.zeros(Float32, 4, 4) + y2 = CUDA.zeros(Float32, 4, 4) + y3 = CUDA.zeros(Float32, 4, 4) + + ct.launch(extract_all_quadrants_kernel, 1, x, y0, y1, y2, y3) + + @test all(Array(y0) .≈ 1.0f0) # Top-left = 1 + @test all(Array(y1) .≈ 2.0f0) # Bottom-left = 2 + @test all(Array(y2) .≈ 3.0f0) # Top-right = 3 + @test all(Array(y3) .≈ 4.0f0) # Bottom-right = 4 + end + + @testset "extract real/imag pattern (FFT)" begin + # This is the pattern used in FFT: shape (BS, N, 2) -> (BS, N, 1) + # Real at slice index 1, imag at slice index 2 + + function extract_real_imag_kernel(x_ri::ct.TileArray{Float32,3}, + y_real::ct.TileArray{Float32,3}, + y_imag::ct.TileArray{Float32,3}) + bid = ct.bid(1) + tile = ct.load(x_ri, (bid, 1, 1), (2, 8, 2)) # (BS, N, real/imag) + # Extract real (slice 1) and imag (slice 2) in last dimension + real_part = ct.extract(tile, (1, 1, 1), (2, 8, 1)) + imag_part = ct.extract(tile, (1, 1, 2), (2, 8, 1)) + ct.store(y_real, (bid, 1, 1), real_part) + ct.store(y_imag, (bid, 1, 1), imag_part) + return + end + + # Create input: real=1.0, imag=2.0 + x = CUDA.zeros(Float32, 2, 8, 2) + x[:, :, 1] .= 1.0f0 # real + x[:, :, 2] .= 2.0f0 # imag + + y_real = CUDA.zeros(Float32, 2, 8, 1) + y_imag = CUDA.zeros(Float32, 2, 8, 1) + + ct.launch(extract_real_imag_kernel, 1, x, y_real, y_imag) + + @test all(Array(y_real) .≈ 1.0f0) # Real component + @test all(Array(y_imag) .≈ 2.0f0) # Imag component + end +end + +@testset "scalar tile getindex" begin + function tile_getindex_kernel(x::ct.TileArray{Float32,1}, y::ct.TileArray{Float32,1}) + tile = ct.load(x, 1, (8,)) + scalar = tile[3] # Extract 3rd element + ct.store(y, 1, ct.broadcast_to(ct.Tile(scalar), (8,))) + return + end + host_x = zeros(Float32, 8) + host_x[3] = 42.0f0 + x = CuArray(host_x) + y = CUDA.zeros(Float32, 8) + ct.launch(tile_getindex_kernel, 1, x, y) + @test all(Array(y) .≈ 42.0f0) +end + +@testset "scalar tile setindex" begin + function tile_setindex_kernel(x::ct.TileArray{Float32,1}, y::ct.TileArray{Float32,1}) + tile = ct.load(x, 1, (8,)) + new_tile = Base.setindex(tile, 0.0f0, 3) + ct.store(y, 1, new_tile) + return + end + x = CuArray(Float32.(1:8)) + y = CUDA.zeros(Float32, 8) + ct.launch(tile_setindex_kernel, 1, x, y) + expected = Float32.(1:8) + expected[3] = 0.0f0 + @test Array(y) ≈ expected +end + +@testset "cat" begin + @testset "cat along last axis (axis -1)" begin + function cat_last_axis_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, + c::ct.TileArray{Float32,2}) + bid = ct.bid(1) + # Load two (4, 4) tiles + tile_a = 
ct.load(a, (bid, 1), (4, 4)) + tile_b = ct.load(b, (bid, 1), (4, 4)) + # Concatenate along last axis -> (4, 8) + combined = ct.cat((tile_a, tile_b), Val(-1)) + ct.store(c, (bid, 1), combined) + return + end + + m, n = 64, 4 + a = CUDA.rand(Float32, m, n) + b = CUDA.rand(Float32, m, n) + c = CUDA.zeros(Float32, m, 8) + + ct.launch(cat_last_axis_kernel, cld(m, 4), a, b, c) + + # Verify concatenation: c[:, 1:4] should match a, c[:, 5:8] should match b + c_cpu = Array(c) + a_cpu = Array(a) + b_cpu = Array(b) + + # Due to memory layout, verify elements are preserved by checking sorted values + for bid in 0:(cld(m, 4)-1) + start_row = bid * 4 + 1 + input_a = a_cpu[start_row:(start_row+3), :] + input_b = b_cpu[start_row:(start_row+3), :] + output = c_cpu[start_row:(start_row+3), :] + + # Combined output should contain all elements from both inputs + expected = sort(vcat(vec(input_a), vec(input_b))) + actual = sort(vec(output)) + @test actual ≈ expected + end + end + + @testset "cat along first axis (axis 1)" begin + function cat_first_axis_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, + c::ct.TileArray{Float32,2}) + bid = ct.bid(1) + # Load two (4, 4) tiles + tile_a = ct.load(a, (bid, 1), (4, 4)) + tile_b = ct.load(b, (bid, 1), (4, 4)) + # Concatenate along first axis -> (8, 4) + combined = ct.cat((tile_a, tile_b), Val(1)) + ct.store(c, (bid, 1), combined) + return + end + + m, n = 32, 4 + a = CUDA.rand(Float32, m, n) + b = CUDA.rand(Float32, m, n) + c = CUDA.zeros(Float32, m * 2, n) + + ct.launch(cat_first_axis_kernel, cld(m, 4), a, b, c) + + # Verify concatenation: elements from both inputs should be preserved + c_cpu = Array(c) + a_cpu = Array(a) + b_cpu = Array(b) + + for bid in 0:(cld(m, 4)-1) + start_a = bid * 4 + 1 + start_c = bid * 8 + 1 + input_a = a_cpu[start_a:(start_a+3), :] + input_b = b_cpu[start_a:(start_a+3), :] + output = c_cpu[start_c:(start_c+7), :] + + # Combined output should contain all elements from both inputs + expected = sort(vcat(vec(input_a), vec(input_b))) + actual = sort(vec(output)) + @test actual ≈ expected + end + end + + @testset "cat roundtrip (extract then cat)" begin + # This tests cat as the inverse of extract: extract splits, cat joins + function extract_cat_roundtrip_kernel(x::ct.TileArray{Float32,2}, y::ct.TileArray{Float32,2}) + bid = ct.bid(1) + # Load 4x8 tile + tile = ct.load(x, (bid, 1), (4, 8)) + # Extract two 4x4 halves + left = ct.extract(tile, (1, 1), (4, 4)) # rows 1-4, cols 1-4 + right = ct.extract(tile, (1, 2), (4, 4)) # rows 1-4, cols 5-8 + # Cat them back together along last axis + combined = ct.cat((left, right), Val(-1)) + ct.store(y, (bid, 1), combined) + return + end + + m, n = 64, 8 + x = CUDA.rand(Float32, m, n) + y = CUDA.zeros(Float32, m, n) + + ct.launch(extract_cat_roundtrip_kernel, cld(m, 4), x, y) + + # Output should match input (roundtrip) + x_cpu = Array(x) + y_cpu = Array(y) + + for bid in 0:(cld(m, 4)-1) + start_row = bid * 4 + 1 + input = x_cpu[start_row:(start_row+3), :] + output = y_cpu[start_row:(start_row+3), :] + + @test output ≈ input + end + end +end + +@testset "matmul" begin + @testset "basic matmul" begin + function matmul_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, + c::ct.TileArray{Float32,2}) + bidx = ct.bid(1) + bidy = ct.bid(2) + # Load tiles: a is (M, K), b is (K, N) + tile_a = ct.load(a, (bidx, 1), (32, 16)) + tile_b = ct.load(b, (1, bidy), (16, 32)) + # matmul: c = a @ b (using * operator) + result = tile_a * tile_b + ct.store(c, (bidx, bidy), result) + return + end + + M, 
K, N = 64, 16, 64 + a = CUDA.rand(Float32, M, K) + b = CUDA.rand(Float32, K, N) + c = CUDA.zeros(Float32, M, N) + + grid_x = cld(M, 32) + grid_y = cld(N, 32) + ct.launch(matmul_kernel, (grid_x, grid_y, 1), a, b, c) + + # Verify against CPU reference + a_cpu = Array(a) + b_cpu = Array(b) + c_cpu = Array(c) + c_ref = a_cpu * b_cpu + + @test c_cpu ≈ c_ref + end +end + +end + +@testset "Constant parameters" begin + +@testset "1D with Constant tile size" begin + function vadd_const_tile(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}, tile::ct.Constant{Int}) + pid = ct.bid(1) + tile_a = ct.load(a, pid, (tile[],)) + tile_b = ct.load(b, pid, (tile[],)) + ct.store(c, pid, tile_a + tile_b) + return + end + + n = 1024 + tile_size = 32 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_const_tile, cld(n, tile_size), a, b, c, ct.Constant(tile_size)) + + @test Array(c) ≈ Array(a) + Array(b) +end + +@testset "2D with Constant tile sizes" begin + function madd_const_tiles(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, + c::ct.TileArray{Float32,2}, + tx::ct.Constant{Int}, ty::ct.Constant{Int}) + bidx = ct.bid(1) + bidy = ct.bid(2) + tile_a = ct.load(a, (bidx, bidy), (tx[], ty[])) + tile_b = ct.load(b, (bidx, bidy), (tx[], ty[])) + ct.store(c, (bidx, bidy), tile_a + tile_b) + return + end + + m, n = 256, 256 + tile_x, tile_y = 64, 64 + a = CUDA.rand(Float32, m, n) + b = CUDA.rand(Float32, m, n) + c = CUDA.zeros(Float32, m, n) + + ct.launch(madd_const_tiles, (cld(m, tile_x), cld(n, tile_y)), a, b, c, + ct.Constant(tile_x), ct.Constant(tile_y)) + + @test Array(c) ≈ Array(a) + Array(b) +end + +end + +@testset "data types" begin + +@testset "Float64" begin + function vadd_f64(a::ct.TileArray{Float64,1}, b::ct.TileArray{Float64,1}, + c::ct.TileArray{Float64,1}) + pid = ct.bid(1) + tile_a = ct.load(a, pid, (16,)) + tile_b = ct.load(b, pid, (16,)) + ct.store(c, pid, tile_a + tile_b) + return + end + + n = 1024 + tile_size = 16 + a = CUDA.rand(Float64, n) + b = CUDA.rand(Float64, n) + c = CUDA.zeros(Float64, n) + + ct.launch(vadd_f64, cld(n, tile_size), a, b, c) + + @test Array(c) ≈ Array(a) + Array(b) +end + +@testset "Float16" begin + function vadd_f16(a::ct.TileArray{Float16,1}, b::ct.TileArray{Float16,1}, + c::ct.TileArray{Float16,1}) + pid = ct.bid(1) + tile_a = ct.load(a, pid, (16,)) + tile_b = ct.load(b, pid, (16,)) + ct.store(c, pid, tile_a + tile_b) + return + end + + n = 1024 + tile_size = 16 + a = CUDA.rand(Float16, n) + b = CUDA.rand(Float16, n) + c = CUDA.zeros(Float16, n) + + ct.launch(vadd_f16, cld(n, tile_size), a, b, c) + + @test Array(c) ≈ Array(a) + Array(b) +end + +@testset "BFloat16" begin + function vadd_bf16(a::ct.TileArray{ct.BFloat16,1}, b::ct.TileArray{ct.BFloat16,1}, + c::ct.TileArray{ct.BFloat16,1}) + pid = ct.bid(1) + tile_a = ct.load(a, pid, (16,)) + tile_b = ct.load(b, pid, (16,)) + ct.store(c, pid, tile_a + tile_b) + return + end + + n = 1024 + tile_size = 16 + a = CUDA.rand(ct.BFloat16, n) + b = CUDA.rand(ct.BFloat16, n) + c = CUDA.zeros(ct.BFloat16, n) + + ct.launch(vadd_bf16, cld(n, tile_size), a, b, c) + + @test Array(c) ≈ Array(a) + Array(b) +end + +end + +@testset "compilation cache" begin + function cached_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + ct.store(b, pid, tile) + return + end + + n = 256 + tile_size = 16 + a = CUDA.rand(Float32, n) + b = CUDA.zeros(Float32, n) + + # First launch triggers 
compilation + ct.launch(cached_kernel, cld(n, tile_size), a, b) + @test Array(b) ≈ Array(a) + + # Second launch should use cached CuFunction + a2 = CUDA.rand(Float32, n) + b2 = CUDA.zeros(Float32, n) + ct.launch(cached_kernel, cld(n, tile_size), a2, b2) + @test Array(b2) ≈ Array(a2) +end + +@testset "TileArray auto-conversion" begin + # Test that CuArrays are automatically converted to TileArray + function copy_kernel(src::ct.TileArray{Float32,1}, dst::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(src, pid, (16,)) + ct.store(dst, pid, tile) + return + end + + n = 512 + tile_size = 16 + src = CUDA.rand(Float32, n) + dst = CUDA.zeros(Float32, n) + + # Pass CuArrays directly - should auto-convert + ct.launch(copy_kernel, cld(n, tile_size), src, dst) + + @test Array(dst) ≈ Array(src) +end + +@testset "math operations" begin + +@testset "1D vector div" begin + function vdiv_1d(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile_a = ct.load(a, pid, (16,)) + tile_b = ct.load(b, pid, (16,)) + ct.store(c, pid, tile_a ./ tile_b) + return + end + + n = 1024 + tile_size = 16 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) .+ 0.1f0 # Ensure non-zero + c = CUDA.zeros(Float32, n) + + ct.launch(vdiv_1d, cld(n, tile_size), a, b, c) + + @test Array(c) ≈ Array(a) ./ Array(b) +end + +for (op, name) in [ + (:sqrt, "sqrt"), (:abs, "abs"), (:cos, "cos"), (:sin, "sin"), + (:exp, "exp"), (:log, "log"), (:ceil, "ceil"), (:floor, "floor"), +] + @eval @testset "1D $($name)" begin + function $(Symbol("vmath_$(name)"))(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + ct.store(b, pid, $op.(tile)) + return + end + a = CUDA.rand(Float32, 1024) .+ 0.1f0 + b = CUDA.zeros(Float32, 1024) + ct.launch($(Symbol("vmath_$(name)")), cld(1024, 16), a, b) + @test Array(b) ≈ $op.(Array(a)) rtol=1e-4 + end +end + +end + +const _EXEC_TEST_GLOBAL_CONST = Float32(1 / log(2)) + +@testset "global constant arithmetic" begin + # Regression test for issue #77: scalar × global constant failed during codegen. 
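+    # The kernel multiplies a runtime scalar argument (`scale`) by the module-level
+    # constant `_EXEC_TEST_GLOBAL_CONST` before broadcasting the product over the
+    # tile; that scalar-times-constant multiply is the operation that used to fail.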
+ function global_const_arith_kernel(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + scale::Float32) + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + total_scale = scale * _EXEC_TEST_GLOBAL_CONST + ct.store(b, pid, tile .* total_scale) + return + end + + n = 1024 + tile_size = 16 + scale = 2.5f0 + a = CUDA.rand(Float32, n) + b = CUDA.zeros(Float32, n) + + ct.launch(global_const_arith_kernel, cld(n, tile_size), a, b, scale) + + @test Array(b) ≈ Array(a) .* (scale * _EXEC_TEST_GLOBAL_CONST) +end + diff --git a/test/execution/broadcast.jl b/test/execution/broadcast.jl new file mode 100644 index 0000000..6a63d5f --- /dev/null +++ b/test/execution/broadcast.jl @@ -0,0 +1,765 @@ +using CUDA + +@testset "scalar-tile operations" begin + +for (name, kernel_expr, cpu_expr) in [ + ("tile / scalar", :(tile / 2.0f0), :(Array(a) ./ 2.0f0)), + ("tile / integer", :(tile / 4), :(Array(a) ./ 4.0f0)), + ("scalar ./ tile", :(1.0f0 ./ tile), :(1.0f0 ./ Array(a))), + ("tile .+ scalar", :(tile .+ 3.5f0), :(Array(a) .+ 3.5f0)), + ("scalar .+ tile", :(2.5f0 .+ tile), :(2.5f0 .+ Array(a))), + ("tile .- scalar", :(tile .- 1.5f0), :(Array(a) .- 1.5f0)), + ("scalar .- tile", :(5.0f0 .- tile), :(5.0f0 .- Array(a))), + ("tile * scalar", :(tile * 2.5f0), :(Array(a) .* 2.5f0)), + ("scalar * tile", :(3.0f0 * tile), :(3.0f0 .* Array(a))), +] + sym = Symbol("scalar_tile_", replace(name, r"[^a-zA-Z0-9]" => "_")) + @eval @testset $name begin + function $sym(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + ct.store(b, pid, $kernel_expr) + return + end + a = CUDA.rand(Float32, 1024) .+ 0.1f0 + b = CUDA.zeros(Float32, 1024) + ct.launch($sym, cld(1024, 16), a, b) + @test Array(b) ≈ $cpu_expr + end +end + +end + +@testset "tile broadcasting" begin + +@testset "1D broadcast: (1,) .+ (128,)" begin + # Test broadcasting a single-element tile to a larger tile + function broadcast_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + # Load scalar-like tile (1 element) + scalar_tile = ct.load(a, 1, (1,)) + # Load full tile (128 elements) + full_tile = ct.load(b, pid, (128,)) + # Broadcast add: (1,) .+ (128,) -> (128,) + result = scalar_tile .+ full_tile + ct.store(c, pid, result) + return + end + + n = 1024 + tile_size = 128 + a = CUDA.rand(Float32, 1) # Single element + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + ct.launch(broadcast_1d_kernel, cld(n, tile_size), a, b, c) + + # Each output element should be a[1] + b[i] + a_cpu = Array(a) + b_cpu = Array(b) + c_cpu = Array(c) + @test c_cpu ≈ a_cpu[1] .+ b_cpu +end + +@testset "2D broadcast: (1, 128) .+ (64, 1)" begin + # Test broadcasting 2D tiles with complementary shapes + function broadcast_2d_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, + c::ct.TileArray{Float32,2}) + # Load row tile (1, 128) and column tile (64, 1) + row_tile = ct.load(a, (1, 1), (1, 128)) + col_tile = ct.load(b, (1, 1), (64, 1)) + # Broadcast add: (1, 128) .+ (64, 1) -> (64, 128) + result = row_tile .+ col_tile + ct.store(c, (1, 1), result) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, 1, n) # Row vector + b = CUDA.rand(Float32, m, 1) # Column vector + c = CUDA.zeros(Float32, m, n) + + ct.launch(broadcast_2d_kernel, 1, a, b, c) + + # Result should be outer sum: c[i,j] = a[1,j] + b[i,1] + a_cpu = Array(a) + b_cpu = Array(b) + c_cpu = Array(c) + expected = a_cpu .+ b_cpu # Julia broadcasting + @test c_cpu ≈ expected +end + +@testset 
"broadcast mul: (4, 1) .* (1, 8)" begin + function broadcast_mul_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, + c::ct.TileArray{Float32,2}) + col_tile = ct.load(a, (1, 1), (4, 1)) + row_tile = ct.load(b, (1, 1), (1, 8)) + # Broadcast multiply: (4, 1) .* (1, 8) -> (4, 8) + result = col_tile .* row_tile + ct.store(c, (1, 1), result) + return + end + + a = CUDA.rand(Float32, 4, 1) + b = CUDA.rand(Float32, 1, 8) + c = CUDA.zeros(Float32, 4, 8) + + ct.launch(broadcast_mul_kernel, 1, a, b, c) + + a_cpu = Array(a) + b_cpu = Array(b) + c_cpu = Array(c) + expected = a_cpu .* b_cpu # Outer product + @test c_cpu ≈ expected +end + +@testset "broadcast sub: (128,) .- (1,)" begin + function broadcast_sub_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + full_tile = ct.load(a, pid, (128,)) + scalar_tile = ct.load(b, 1, (1,)) + # Broadcast subtract: (128,) .- (1,) -> (128,) + result = full_tile .- scalar_tile + ct.store(c, pid, result) + return + end + + n = 1024 + tile_size = 128 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, 1) # Single element + c = CUDA.zeros(Float32, n) + + ct.launch(broadcast_sub_kernel, cld(n, tile_size), a, b, c) + + a_cpu = Array(a) + b_cpu = Array(b) + c_cpu = Array(c) + @test c_cpu ≈ a_cpu .- b_cpu[1] +end + +@testset "broadcast div: (64, 128) ./ (1, 128)" begin + # Divide each row by a scaling vector + function broadcast_div_kernel(a::ct.TileArray{Float32,2}, scale::ct.TileArray{Float32,2}, + c::ct.TileArray{Float32,2}) + data = ct.load(a, (1, 1), (64, 128)) + scale_row = ct.load(scale, (1, 1), (1, 128)) + # Broadcast divide: (64, 128) ./ (1, 128) -> (64, 128) + result = data ./ scale_row + ct.store(c, (1, 1), result) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + scale = CUDA.rand(Float32, 1, n) .+ 0.1f0 # Non-zero scale factors + c = CUDA.zeros(Float32, m, n) + + ct.launch(broadcast_div_kernel, 1, a, scale, c) + + a_cpu = Array(a) + scale_cpu = Array(scale) + c_cpu = Array(c) + expected = a_cpu ./ scale_cpu + @test c_cpu ≈ expected +end + +@testset "explicit broadcast_to" begin + # Test ct.broadcast_to() for explicit shape broadcasting + function broadcast_to_kernel(a::ct.TileArray{Float32,2}, c::ct.TileArray{Float32,2}) + # Load a row tile (1, 128) + row_tile = ct.load(a, (1, 1), (1, 128)) + # Explicitly broadcast to (64, 128) + expanded = ct.broadcast_to(row_tile, (64, 128)) + ct.store(c, (1, 1), expanded) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, 1, n) + c = CUDA.zeros(Float32, m, n) + + ct.launch(broadcast_to_kernel, 1, a, c) + + a_cpu = Array(a) + c_cpu = Array(c) + # Each row of c should equal the single row of a + for i in 1:m + @test c_cpu[i, :] ≈ a_cpu[1, :] + end +end + +end + +@testset "comparison operations" begin + +for (name, op1, op2) in [ + ("< and >", :<, :>), + ("<= and >=", :<=, :>=), +] + sym = Symbol("cmp_", replace(name, r"[^a-zA-Z0-9]" => "_")) + @eval @testset "float $($name)" begin + function $sym(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + out1::ct.TileArray{Float32,1}, out2::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + ct.store(out1, pid, ct.where(broadcast($op1, ta, tb), 1.0f0, 0.0f0)) + ct.store(out2, pid, ct.where(broadcast($op2, ta, tb), 1.0f0, 0.0f0)) + return + end + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + out1 = CUDA.zeros(Float32, n) + out2 = CUDA.zeros(Float32, n) + ct.launch($sym, cld(n, 16), a, b, out1, out2) + @test 
Array(out1) ≈ Float32.(broadcast($op1, Array(a), Array(b))) + @test Array(out2) ≈ Float32.(broadcast($op2, Array(a), Array(b))) + end +end + +@testset "float .== and .!=" begin + function cmp_eq_ne_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + out_eq::ct.TileArray{Float32,1}, out_ne::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + ct.store(out_eq, pid, ct.where(ta .== tb, 1.0f0, 0.0f0)) + ct.store(out_ne, pid, ct.where(ta .!= tb, 1.0f0, 0.0f0)) + return + end + + n = 1024 + # Use integer-valued floats so equality is meaningful + a = CUDA.fill(Float32(1), n) + b = CUDA.fill(Float32(1), n) + # Set half to different values + CUDA.@allowscalar b[1:512] .= 2.0f0 + out_eq = CUDA.zeros(Float32, n) + out_ne = CUDA.zeros(Float32, n) + + ct.launch(cmp_eq_ne_kernel, cld(n, 16), a, b, out_eq, out_ne) + + @test Array(out_eq) ≈ Float32.(Array(a) .== Array(b)) + @test Array(out_ne) ≈ Float32.(Array(a) .!= Array(b)) +end + +@testset "tile vs scalar comparison" begin + function cmp_scalar_kernel(a::ct.TileArray{Float32,1}, + out::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + ct.store(out, pid, ct.where(ta .> 0.5f0, 1.0f0, 0.0f0)) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + out = CUDA.zeros(Float32, n) + + ct.launch(cmp_scalar_kernel, cld(n, 16), a, out) + + @test Array(out) ≈ Float32.(Array(a) .> 0.5f0) +end + +end + +@testset "power operations" begin + +@testset "tile .^ tile" begin + function pow_tt_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + ct.store(c, pid, ta .^ tb) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) .+ 0.5f0 # Ensure positive base + b = CUDA.rand(Float32, n) .+ 0.5f0 + c = CUDA.zeros(Float32, n) + + ct.launch(pow_tt_kernel, cld(n, 16), a, b, c) + + @test Array(c) ≈ Array(a) .^ Array(b) rtol=1e-4 +end + +@testset "tile .^ scalar" begin + function pow_ts_kernel(a::ct.TileArray{Float32,1}, c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + ct.store(c, pid, ta .^ 2.0f0) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) .+ 0.1f0 + c = CUDA.zeros(Float32, n) + + ct.launch(pow_ts_kernel, cld(n, 16), a, c) + + @test Array(c) ≈ Array(a) .^ 2.0f0 rtol=1e-4 +end + +end + +@testset "where / ifelse broadcasting" begin + +@testset "where same-shape" begin + function where_same_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + mask = ta .> tb + result = ct.where(mask, ta, tb) + ct.store(c, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + ct.launch(where_same_kernel, cld(n, 16), a, b, c) + + @test Array(c) ≈ ifelse.(Array(a) .> Array(b), Array(a), Array(b)) rtol=1e-5 +end + +@testset "where with scalar y" begin + function where_scalar_y_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + mask = ta .> 0.5f0 + result = ct.where(mask, ta, 0.0f0) + ct.store(b, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.zeros(Float32, n) + + ct.launch(where_scalar_y_kernel, cld(n, 16), a, b) + + @test Array(b) ≈ ifelse.(Array(a) .> 0.5f0, Array(a), 0.0f0) rtol=1e-5 +end + +@testset "where with scalar x" begin + function 
where_scalar_x_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + mask = ta .> 0.5f0 + result = ct.where(mask, 1.0f0, ta) + ct.store(b, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.zeros(Float32, n) + + ct.launch(where_scalar_x_kernel, cld(n, 16), a, b) + + @test Array(b) ≈ ifelse.(Array(a) .> 0.5f0, 1.0f0, Array(a)) rtol=1e-5 +end + +@testset "where with broadcasting" begin + function where_broadcast_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}) + mask = ct.load(a, (1, 1), (1, 128)) # (1, 128) mask + tile = ct.load(a, (1, 1), (64, 128)) # (64, 128) tile + result = ct.where(mask .> 0.5f0, tile, 0.0f0) + ct.store(b, (1, 1), result) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m, n) + + ct.launch(where_broadcast_kernel, 1, a, b) + + a_cpu = Array(a) + mask_cpu = a_cpu[1:1, :] .> 0.5f0 + expected = ifelse.(mask_cpu, a_cpu, 0.0f0) + @test Array(b) ≈ expected rtol=1e-5 +end + +@testset "ifelse. same-shape" begin + function ifelse_same_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + result = ifelse.(ta .> tb, ta, tb) + ct.store(c, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + ct.launch(ifelse_same_kernel, cld(n, 16), a, b, c) + + @test Array(c) ≈ ifelse.(Array(a) .> Array(b), Array(a), Array(b)) rtol=1e-5 +end + +@testset "ifelse. with scalar y" begin + function ifelse_scalar_y_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + result = ifelse.(ta .> 0.5f0, ta, 0.0f0) + ct.store(b, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.zeros(Float32, n) + + ct.launch(ifelse_scalar_y_kernel, cld(n, 16), a, b) + + @test Array(b) ≈ ifelse.(Array(a) .> 0.5f0, Array(a), 0.0f0) rtol=1e-5 +end + +@testset "ifelse. with both scalars" begin + function ifelse_both_scalar_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + result = ifelse.(ta .> 0.5f0, 1.0f0, 0.0f0) + ct.store(b, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.zeros(Float32, n) + + ct.launch(ifelse_both_scalar_kernel, cld(n, 16), a, b) + + @test Array(b) ≈ ifelse.(Array(a) .> 0.5f0, 1.0f0, 0.0f0) rtol=1e-5 +end + +@testset "ifelse. with broadcasting shapes" begin + function ifelse_broadcast_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}) + col_mask = ct.load(a, (1, 1), (64, 1)) # (64, 1) column + tile = ct.load(a, (1, 1), (64, 128)) # (64, 128) tile + result = ifelse.(col_mask .> 0.5f0, tile, 0.0f0) + ct.store(b, (1, 1), result) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m, n) + + ct.launch(ifelse_broadcast_kernel, 1, a, b) + + a_cpu = Array(a) + mask_cpu = a_cpu[:, 1:1] .> 0.5f0 + expected = ifelse.(mask_cpu, a_cpu, 0.0f0) + @test Array(b) ≈ expected rtol=1e-5 +end + +end # where / ifelse broadcasting + +@testset "max / min broadcasting" begin + +@testset "max. 
float tile-tile" begin + function max_float_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + result = max.(ta, tb) + ct.store(c, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + ct.launch(max_float_kernel, cld(n, 16), a, b, c) + + @test Array(c) ≈ max.(Array(a), Array(b)) rtol=1e-5 +end + +@testset "min. float tile-tile" begin + function min_float_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + result = min.(ta, tb) + ct.store(c, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + ct.launch(min_float_kernel, cld(n, 16), a, b, c) + + @test Array(c) ≈ min.(Array(a), Array(b)) rtol=1e-5 +end + +@testset "max. float tile-scalar (ReLU)" begin + function relu_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + result = max.(ta, 0.0f0) + ct.store(b, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) .- 0.5f0 # Mix of positive and negative + b = CUDA.zeros(Float32, n) + + ct.launch(relu_kernel, cld(n, 16), a, b) + + @test Array(b) ≈ max.(Array(a), 0.0f0) rtol=1e-5 +end + +@testset "min. float tile-scalar" begin + function clamp_max_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + result = min.(ta, 1.0f0) + ct.store(b, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) .* 2.0f0 # Values in [0, 2] + b = CUDA.zeros(Float32, n) + + ct.launch(clamp_max_kernel, cld(n, 16), a, b) + + @test Array(b) ≈ min.(Array(a), 1.0f0) rtol=1e-5 +end + +@testset "max. integer tile-tile (signed)" begin + function max_int_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1}, + c::ct.TileArray{Int32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + result = max.(ta, tb) + ct.store(c, pid, result) + return + end + + n = 1024 + a = CuArray(rand(Int32(-100):Int32(100), n)) + b = CuArray(rand(Int32(-100):Int32(100), n)) + c = CUDA.zeros(Int32, n) + + ct.launch(max_int_kernel, cld(n, 16), a, b, c) + + @test Array(c) == max.(Array(a), Array(b)) +end + +@testset "min. integer tile-tile (signed)" begin + function min_int_kernel(a::ct.TileArray{Int32,1}, b::ct.TileArray{Int32,1}, + c::ct.TileArray{Int32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + result = min.(ta, tb) + ct.store(c, pid, result) + return + end + + n = 1024 + a = CuArray(rand(Int32(-100):Int32(100), n)) + b = CuArray(rand(Int32(-100):Int32(100), n)) + c = CUDA.zeros(Int32, n) + + ct.launch(min_int_kernel, cld(n, 16), a, b, c) + + @test Array(c) == min.(Array(a), Array(b)) +end + +@testset "max. 
broadcasting: (64,1) vs (1,128)" begin + function max_broadcast_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, + c::ct.TileArray{Float32,2}) + col_tile = ct.load(a, (1, 1), (64, 1)) + row_tile = ct.load(b, (1, 1), (1, 128)) + result = max.(col_tile, row_tile) + ct.store(c, (1, 1), result) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, 1) + b = CUDA.rand(Float32, 1, n) + c = CUDA.zeros(Float32, m, n) + + ct.launch(max_broadcast_kernel, 1, a, b, c) + + @test Array(c) ≈ max.(Array(a), Array(b)) rtol=1e-5 +end + +end # max / min broadcasting + +@testset "fma broadcasting" begin + +@testset "fma. same-shape" begin + function fma_same_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}, d::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + tc = ct.load(c, pid, (16,)) + result = fma.(ta, tb, tc) + ct.store(d, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.rand(Float32, n) + d = CUDA.zeros(Float32, n) + + ct.launch(fma_same_kernel, cld(n, 16), a, b, c, d) + + @test Array(d) ≈ fma.(Array(a), Array(b), Array(c)) rtol=1e-5 +end + +@testset "fma. with scalar c" begin + function fma_scalar_c_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + result = fma.(ta, tb, 1.0f0) + ct.store(c, pid, result) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + + ct.launch(fma_scalar_c_kernel, cld(n, 16), a, b, c) + + @test Array(c) ≈ fma.(Array(a), Array(b), 1.0f0) rtol=1e-5 +end + +@testset "fma. with broadcasting bias" begin + function fma_broadcast_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}, + bias::ct.TileArray{Float32,2}, c::ct.TileArray{Float32,2}) + ta = ct.load(a, (1, 1), (64, 128)) + tb = ct.load(b, (1, 1), (64, 128)) + tbias = ct.load(bias, (1, 1), (1, 128)) # (1, 128) bias row + result = fma.(ta, tb, tbias) + ct.store(c, (1, 1), result) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.rand(Float32, m, n) + bias = CUDA.rand(Float32, 1, n) + c = CUDA.zeros(Float32, m, n) + + ct.launch(fma_broadcast_kernel, 1, a, b, bias, c) + + @test Array(c) ≈ fma.(Array(a), Array(b), Array(bias)) rtol=1e-5 +end + +end # fma broadcasting + +@testset "multi-arg map" begin + @testset "binary map(+, ...)" begin + function map_add_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + ct.store(c, pid, map(+, ta, tb)) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + ct.launch(map_add_kernel, cld(n, 16), a, b, c) + @test Array(c) ≈ Array(a) + Array(b) + end + + @testset "ternary map(fma, ...)" begin + function map_fma_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}, d::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + tc = ct.load(c, pid, (16,)) + ct.store(d, pid, map(fma, ta, tb, tc)) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.rand(Float32, n) + d = CUDA.zeros(Float32, n) + ct.launch(map_fma_kernel, cld(n, 16), a, b, c, d) + @test Array(d) ≈ fma.(Array(a), Array(b), Array(c)) + end + + 
@testset "nested broadcast a .+ b .* c" begin + function nested_bc_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}, d::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + tc = ct.load(c, pid, (16,)) + ct.store(d, pid, ta .+ tb .* tc) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.rand(Float32, n) + d = CUDA.zeros(Float32, n) + ct.launch(nested_bc_kernel, cld(n, 16), a, b, c, d) + @test Array(d) ≈ Array(a) .+ Array(b) .* Array(c) + end + + @testset "ifelse broadcast" begin + function ifelse_bc_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + mask = ta .> tb + ct.store(c, pid, ifelse.(mask, ta, tb)) + return + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.rand(Float32, n) + c = CUDA.zeros(Float32, n) + ct.launch(ifelse_bc_kernel, cld(n, 16), a, b, c) + @test Array(c) ≈ max.(Array(a), Array(b)) + end +end diff --git a/test/execution/hints.jl b/test/execution/hints.jl new file mode 100644 index 0000000..56d398e --- /dev/null +++ b/test/execution/hints.jl @@ -0,0 +1,236 @@ +using CUDA + +@testset "Entry Hints" begin + +@testset "launch with num_ctas" begin + function vadd_kernel_num_ctas(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + ct.store(c, pid, ta + tb) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_kernel_num_ctas, 64, a, b, c; num_ctas=2) + + @test Array(c) ≈ ones(Float32, n) .* 3 +end + +@testset "launch with occupancy" begin + function vadd_kernel_occupancy(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + ct.store(c, pid, ta + tb) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_kernel_occupancy, 64, a, b, c; occupancy=4) + + @test Array(c) ≈ ones(Float32, n) .* 3 +end + +@testset "launch with both hints" begin + function vadd_kernel_both_hints(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + tb = ct.load(b, pid, (16,)) + ct.store(c, pid, ta + tb) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_kernel_both_hints, 64, a, b, c; num_ctas=4, occupancy=8) + + @test Array(c) ≈ ones(Float32, n) .* 3 +end + +end + + +@testset "Load / Store Optimization Hints" begin + +@testset "load with latency hint" begin + function vadd_with_load_latency(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,); latency=5) + tb = ct.load(b, pid, (16,); latency=3) + ct.store(c, pid, ta + tb) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_with_load_latency, 64, a, b, c) + + @test Array(c) ≈ ones(Float32, n) .* 3 +end + +@testset "load with allow_tma=false" begin + function vadd_no_tma(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + 
c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,); allow_tma=false) + tb = ct.load(b, pid, (16,); allow_tma=false) + ct.store(c, pid, ta + tb) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_no_tma, 64, a, b, c) + + @test Array(c) ≈ ones(Float32, n) .* 3 +end + +@testset "load with both hints" begin + function vadd_both_load_hints(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,); latency=7, allow_tma=false) + tb = ct.load(b, pid, (16,); latency=4, allow_tma=true) + ct.store(c, pid, ta + tb) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_both_load_hints, 64, a, b, c) + + @test Array(c) ≈ ones(Float32, n) .* 3 +end + +@testset "store with latency hint" begin + function copy_with_store_latency(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + ct.store(b, pid, ta; latency=2) + return nothing + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.zeros(Float32, n) + + ct.launch(copy_with_store_latency, 64, a, b) + + @test Array(b) ≈ Array(a) +end + +@testset "store with allow_tma=false" begin + function copy_no_tma_store(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + ta = ct.load(a, pid, (16,)) + ct.store(b, pid, ta; allow_tma=false) + return nothing + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.zeros(Float32, n) + + ct.launch(copy_no_tma_store, 64, a, b) + + @test Array(b) ≈ Array(a) +end + +@testset "different hints on load and store" begin + function vadd_mixed_hints(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}, + c::ct.TileArray{Float32,1}) + pid = ct.bid(1) + # Load with high latency, no TMA + ta = ct.load(a, pid, (16,); latency=8, allow_tma=false) + tb = ct.load(b, pid, (16,); latency=6, allow_tma=false) + # Store with low latency, allow TMA + ct.store(c, pid, ta + tb; latency=2, allow_tma=true) + return nothing + end + + n = 1024 + a = CUDA.ones(Float32, n) + b = CUDA.ones(Float32, n) .* 2 + c = CUDA.zeros(Float32, n) + + ct.launch(vadd_mixed_hints, 64, a, b, c) + + @test Array(c) ≈ ones(Float32, n) .* 3 +end + +# Pointer-based operations (gather/scatter) with latency hints +@testset "gather with latency hint" begin + function gather_with_latency(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + base = (pid - 1) * 16 + indices = base .+ ct.arange((16,), Int32) + tile = ct.gather(a, indices; latency=5) + ct.store(b, pid, tile) + return nothing + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.zeros(Float32, n) + + ct.launch(gather_with_latency, 64, a, b) + + @test Array(b) ≈ Array(a) +end + +@testset "scatter with latency hint" begin + function scatter_with_latency(a::ct.TileArray{Float32,1}, + b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + base = (pid - 1) * 16 + indices = base .+ ct.arange((16,), Int32) + ct.scatter(b, indices, tile; latency=3) + return nothing + end + + n = 1024 + a = CUDA.rand(Float32, n) + b = CUDA.zeros(Float32, n) + + ct.launch(scatter_with_latency, 64, a, b) + + @test Array(b) ≈ Array(a) +end + +end diff --git a/test/execution/reductions.jl b/test/execution/reductions.jl new file mode 100644 index 0000000..80e305a --- /dev/null +++ b/test/execution/reductions.jl @@ -0,0 +1,569 @@ 
+using CUDA + +@testset "sum along axis 2" begin + function sum_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 128)) + sums = sum(tile; dims=2) + ct.store(b, pid, sums) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m) + + ct.launch(sum_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ sum(a_cpu[i, :]) rtol=1e-3 + end +end + +@testset "sum along axis 1" begin + function sum_axis1_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (1, pid), (64, 1)) + sums = sum(tile; dims=1) + ct.store(b, pid, sums) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, n) + + ct.launch(sum_axis1_kernel, n, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for j in 1:n + @test b_cpu[j] ≈ sum(a_cpu[:, j]) rtol=1e-3 + end +end + +@testset "maximum along axis 2" begin + function maximum_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 128)) + maxes = maximum(tile; dims=2) + ct.store(b, pid, maxes) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m) + + ct.launch(maximum_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ maximum(a_cpu[i, :]) + end +end + +@testset "minimum along axis 2" begin + function minimum_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 128)) + mins = minimum(tile; dims=2) + ct.store(b, pid, mins) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m) + + ct.launch(minimum_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ minimum(a_cpu[i, :]) + end +end + +@testset "prod along axis 2" begin + function prod_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 128)) + prods = prod(tile; dims=2) + ct.store(b, pid, prods) + return + end + + m, n = 64, 128 + # Use small values to avoid overflow/underflow + a = CuArray(rand(Float32, m, n) .* 0.1f0 .+ 0.95f0) + b = CUDA.zeros(Float32, m) + + ct.launch(prod_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ prod(a_cpu[i, :]) rtol=1e-2 + end +end + +@testset "reduce with custom combiner" begin + function custom_reduce_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 128)) + sums = reduce((x, y) -> x + y, tile; dims=2, init=0.0f0) + ct.store(b, pid, sums) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) + b = CUDA.zeros(Float32, m) + + ct.launch(custom_reduce_kernel, m, a, b) + + a_cpu = Array(a) + b_cpu = Array(b) + for i in 1:m + @test b_cpu[i] ≈ sum(a_cpu[i, :]) rtol=1e-3 + end +end + +@testset "map(abs, tile)" begin + function map_abs_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2}) + pid = ct.bid(1) + tile = ct.load(a, (pid, 1), (1, 128)) + result = map(abs, tile) + ct.store(b, (pid, 1), result) + return + end + + m, n = 64, 128 + a = CUDA.rand(Float32, m, n) .- 0.5f0 + b = CUDA.zeros(Float32, m, n) + + ct.launch(map_abs_kernel, m, a, b) + + @test Array(b) ≈ abs.(Array(a)) rtol=1e-5 +end + +@testset "mapreduce(abs, +, tile)" begin + function mapreduce_abs_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1}) + pid = 
ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 128))
+        sums = mapreduce(abs, +, tile; dims=2, init=0.0f0)
+        ct.store(b, pid, sums)
+        return
+    end
+
+    m, n = 64, 128
+    a = CUDA.rand(Float32, m, n) .- 0.5f0
+    b = CUDA.zeros(Float32, m)
+
+    ct.launch(mapreduce_abs_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ sum(abs, a_cpu[i, :]) rtol=1e-3
+    end
+end
+
+@testset "mapreduce(x -> x * x, +, tile)" begin
+    function mapreduce_sq_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 128))
+        sums = mapreduce(x -> x * x, +, tile; dims=2, init=0.0f0)
+        ct.store(b, pid, sums)
+        return
+    end
+
+    m, n = 64, 128
+    a = CUDA.rand(Float32, m, n)
+    b = CUDA.zeros(Float32, m)
+
+    ct.launch(mapreduce_sq_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ sum(x -> x^2, a_cpu[i, :]) rtol=1e-3
+    end
+end
+
+@testset "dropdims" begin
+    # Reduce a (1, 128) row tile to a (1, 1) sum, then dropdims the trailing
+    # singleton so the per-row result can be stored into a 1D output (the same
+    # pattern used when broadcasting a row mean back against the original tile).
+    function dropdims_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,1})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (1, 128))   # (1, 128)
+        row_sum = sum(tile; dims=2)              # (1, 1)
+        row_sum_1d = dropdims(row_sum; dims=2)   # (1,)
+        ct.store(b, pid, row_sum_1d)
+        return
+    end
+
+    m, n = 64, 128
+    a = CUDA.rand(Float32, m, n)
+    b = CUDA.zeros(Float32, m)
+
+    ct.launch(dropdims_kernel, m, a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    for i in 1:m
+        @test b_cpu[i] ≈ sum(a_cpu[i, :]) rtol=1e-3
+    end
+end
+
+@testset "1D cumsum (forward)" begin
+    function cumsum_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
+                              tile_size::ct.Constant{Int})
+        bid = ct.bid(1)
+        tile = ct.load(a, bid, (tile_size[],))
+        result = cumsum(tile; dims=1)
+        ct.store(b, bid, result)
+        return nothing
+    end
+
+    sz = 32
+    N = 1024
+    a = CUDA.rand(Float32, N)
+    b = CUDA.zeros(Float32, N)
+
+    ct.launch(cumsum_1d_kernel, cld(N, sz), a, b, ct.Constant(sz))
+
+    # Per-tile cumulative sum
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    a_reshaped = reshape(a_cpu, sz, :)
+    expected = mapslices(x -> accumulate(+, x), a_reshaped, dims=1)
+    @test b_cpu ≈ vec(expected) rtol=1e-3
+end
+
+@testset "2D cumsum along axis 1" begin
+    function cumsum_2d_axis1_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Float32,2})
+        pid = ct.bid(1)
+        tile = ct.load(a, (pid, 1), (4, 8))
+        result = cumsum(tile; dims=1)
+        ct.store(b, (pid, 1), result)
+        return nothing
+    end
+
+    m, n = 32, 8
+    a = CUDA.rand(Float32, m, n)
+    b = CUDA.zeros(Float32, m, n)
+
+    ct.launch(cumsum_2d_axis1_kernel, cld(m, 4), a, b)
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    # cumsum along dim 1 within each 4-row tile
+    for bid in 0:(cld(m, 4)-1)
+        rows = (bid*4+1):(bid*4+4)
+        for j in 1:n
+            @test b_cpu[rows, j] ≈ accumulate(+, a_cpu[rows, j]) rtol=1e-3
+        end
+    end
+end
+
+@testset "1D reverse cumsum" begin
+    function reverse_cumsum_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1},
+                                   tile_size::ct.Constant{Int})
+        bid = ct.bid(1)
+        tile = ct.load(a, bid, (tile_size[],))
+        result = cumsum(tile; dims=1, rev=true)
+        ct.store(b, bid, result)
+        return nothing
+    end
+
+    sz = 32
+    N = 1024
+    a = CUDA.rand(Float32, N)
+    b = CUDA.zeros(Float32, N)
+
+    ct.launch(reverse_cumsum_kernel, cld(N, sz), a, b, ct.Constant(sz))
+
+    a_cpu = Array(a)
+    b_cpu = Array(b)
+    a_reshaped = reshape(a_cpu, sz, :)
+    expected = mapslices(x -> reverse(accumulate(+, reverse(x))),
a_reshaped, dims=1) + @test b_cpu ≈ vec(expected) rtol=1e-3 +end + +@testset "1D cumprod" begin + function cumprod_1d_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Float32,1}, + tile_size::ct.Constant{Int}) + bid = ct.bid(1) + tile = ct.load(a, bid, (tile_size[],)) + result = cumprod(tile; dims=1) + ct.store(b, bid, result) + return nothing + end + + sz = 32 + N = 1024 + # Use values close to 1.0 to avoid overflow/underflow + a = CuArray(rand(Float32, N) .* 0.1f0 .+ 0.95f0) + b = CUDA.zeros(Float32, N) + + ct.launch(cumprod_1d_kernel, cld(N, sz), a, b, ct.Constant(sz)) + + a_cpu = Array(a) + b_cpu = Array(b) + a_reshaped = reshape(a_cpu, sz, :) + expected = mapslices(x -> accumulate(*, x), a_reshaped, dims=1) + @test b_cpu ≈ vec(expected) rtol=1e-2 +end + +@testset "1D reduce operations" begin + TILE_SIZE = 32 + N = 1024 + + function reduce_sum_1d(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, + tileSz::ct.Constant{Int}) where {T} + ct.store(b, ct.bid(1), sum(ct.load(a, ct.bid(1), (tileSz[],)); dims=1)) + return nothing + end + + function reduce_max_1d(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, + tileSz::ct.Constant{Int}) where {T} + ct.store(b, ct.bid(1), maximum(ct.load(a, ct.bid(1), (tileSz[],)); dims=1)) + return nothing + end + + function cpu_reduce(a_reshaped::AbstractArray{T}, op) where {T} + result = mapslices(op, a_reshaped, dims=1)[:] + # For unsigned sum, apply mask to handle overflow + if T <: Unsigned && op === sum + result .= result .& typemax(T) + end + return result + end + + TEST_TYPES = [Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Float16, Float32, Float64] + + TEST_OPS = [ + (reduce_sum_1d, sum), + (reduce_max_1d, maximum), + ] + + @testset "Type: $elType, Operation: $gpu_kernel" for elType in TEST_TYPES, (gpu_kernel, cpu_op) in TEST_OPS + # Generate input data with type-appropriate ranges to avoid overflow + if elType == UInt8 + a_gpu = CuArray{UInt8}(rand(UInt8(0):UInt8(7), N)) + elseif elType == Int8 + a_gpu = CuArray{Int8}(rand(-3:3, N)) + elseif elType == Int16 + a_gpu = CuArray{Int16}(rand(-800:800, N)) + elseif elType == UInt16 + a_gpu = CuArray{UInt16}(rand(1:2000, N)) + elseif elType <: Integer && elType <: Signed + a_gpu = CuArray{elType}(rand(-1000:1000, N)) + else + a_gpu = CUDA.rand(elType, N) + end + b_gpu = CUDA.zeros(elType, cld(N, TILE_SIZE)) + + ct.launch(gpu_kernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) + + a_cpu = Array(a_gpu) + b_cpu = Array(b_gpu) + a_reshaped = reshape(a_cpu, TILE_SIZE, :) + cpu_result = cpu_reduce(a_reshaped, cpu_op) + + if elType <: AbstractFloat + @test b_cpu ≈ cpu_result rtol=1e-3 + else + @test b_cpu == cpu_result + end + end +end + +@testset "1D scan (cumsum)" begin + TILE_SIZE = 32 + N = 1024 + + function scan_kernel(a::ct.TileArray{T,1}, b::ct.TileArray{T,1}, tileSz::ct.Constant{Int}) where {T} + ct.store(b, ct.bid(1), cumsum(ct.load(a, ct.bid(1), (tileSz[],)); dims=1)) + return nothing + end + + TEST_TYPES = [Float16, Float32, Float64, Int32, Int64, UInt32, UInt64] + + @testset "Type: $elType" for elType in TEST_TYPES + # Type-appropriate input generation (small values to avoid overflow in cumsum) + if elType <: Integer && elType <: Signed + a_gpu = CuArray{elType}(rand(elType(-3):elType(3), N)) + elseif elType <: Integer + a_gpu = CuArray{elType}(rand(elType(0):elType(7), N)) + else + a_gpu = CUDA.rand(elType, N) + end + b_gpu = CUDA.zeros(elType, N) + + ct.launch(scan_kernel, cld(N, TILE_SIZE), a_gpu, b_gpu, ct.Constant(TILE_SIZE)) + + a_cpu = Array(a_gpu) + b_cpu = Array(b_gpu) + + # 
CPU reference: per-tile cumulative sum + a_reshaped = reshape(a_cpu, TILE_SIZE, :) + expected = mapslices(x -> accumulate(+, x), a_reshaped, dims=1) + + if elType <: AbstractFloat + @test b_cpu ≈ vec(expected) rtol=1e-3 + else + @test b_cpu == vec(expected) + end + end +end + +@testset "any / all" begin + TILE_SIZE = 16 + + function any_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Int32,1}, + tileSz::ct.Constant{Int}) + tile = ct.load(a, ct.bid(1), (tileSz[],)) + mask = tile .> 0.0f0 + result = any(mask; dims=1) + ct.store(b, ct.bid(1), convert(ct.Tile{Int32}, result)) + return nothing + end + + function all_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Int32,1}, + tileSz::ct.Constant{Int}) + tile = ct.load(a, ct.bid(1), (tileSz[],)) + mask = tile .> 0.0f0 + result = all(mask; dims=1) + ct.store(b, ct.bid(1), convert(ct.Tile{Int32}, result)) + return nothing + end + + N = 64 + n_blocks = cld(N, TILE_SIZE) + + # All positive → any=true, all=true + a_pos = CUDA.ones(Float32, N) + b_any = CUDA.zeros(Int32, n_blocks) + b_all = CUDA.zeros(Int32, n_blocks) + ct.launch(any_kernel, n_blocks, a_pos, b_any, ct.Constant(TILE_SIZE)) + ct.launch(all_kernel, n_blocks, a_pos, b_all, ct.Constant(TILE_SIZE)) + @test all(Array(b_any) .== 1) + @test all(Array(b_all) .== 1) + + # All negative → any=false, all=false + a_neg = CUDA.fill(Float32(-1), N) + b_any = CUDA.zeros(Int32, n_blocks) + b_all = CUDA.zeros(Int32, n_blocks) + ct.launch(any_kernel, n_blocks, a_neg, b_any, ct.Constant(TILE_SIZE)) + ct.launch(all_kernel, n_blocks, a_neg, b_all, ct.Constant(TILE_SIZE)) + @test all(Array(b_any) .== 0) + @test all(Array(b_all) .== 0) + + # Mixed → any=true, all=false (first element positive, rest negative) + a_mix = CUDA.fill(Float32(-1), N) + # Set first element of each tile to positive + a_mix_cpu = Array(a_mix) + for i in 1:TILE_SIZE:N + a_mix_cpu[i] = 1.0f0 + end + a_mix = CuArray(a_mix_cpu) + b_any = CUDA.zeros(Int32, n_blocks) + b_all = CUDA.zeros(Int32, n_blocks) + ct.launch(any_kernel, n_blocks, a_mix, b_any, ct.Constant(TILE_SIZE)) + ct.launch(all_kernel, n_blocks, a_mix, b_all, ct.Constant(TILE_SIZE)) + @test all(Array(b_any) .== 1) + @test all(Array(b_all) .== 0) +end + +@testset "count" begin + TILE_SIZE = 16 + + function count_kernel(a::ct.TileArray{Float32,1}, b::ct.TileArray{Int32,1}, + tileSz::ct.Constant{Int}) + tile = ct.load(a, ct.bid(1), (tileSz[],)) + result = count(tile .> 0.0f0; dims=1) + ct.store(b, ct.bid(1), result) + return nothing + end + + N = 64 + n_blocks = cld(N, TILE_SIZE) + + # Known pattern: 3 positive per tile + a_cpu = fill(Float32(-1), N) + for i in 1:TILE_SIZE:N + a_cpu[i] = 1.0f0 + a_cpu[i+1] = 2.0f0 + a_cpu[i+2] = 3.0f0 + end + a = CuArray(a_cpu) + b = CUDA.zeros(Int32, n_blocks) + + ct.launch(count_kernel, n_blocks, a, b, ct.Constant(TILE_SIZE)) + + @test all(Array(b) .== 3) +end + +@testset "argmax / argmin" begin + TILE_SIZE = 16 + + function argmax_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Int32,2}) + tile = ct.load(a, ct.bid(1), (4, 16)) + result = argmax(tile; dims=2) + ct.store(b, ct.bid(1), result) + return nothing + end + + function argmin_kernel(a::ct.TileArray{Float32,2}, b::ct.TileArray{Int32,2}) + tile = ct.load(a, ct.bid(1), (4, 16)) + result = argmin(tile; dims=2) + ct.store(b, ct.bid(1), result) + return nothing + end + + m, n = 4, 16 + # Create data with known argmax/argmin positions + a_cpu = zeros(Float32, m, n) + for row in 1:m + for col in 1:n + a_cpu[row, col] = Float32(col) # max at col 16, min at col 1 + end + end + a = 
CuArray(a_cpu) + b_max = CUDA.zeros(Int32, m, 1) + b_min = CUDA.zeros(Int32, m, 1) + + ct.launch(argmax_kernel, 1, a, b_max) + ct.launch(argmin_kernel, 1, a, b_min) + + b_max_cpu = Array(b_max) + b_min_cpu = Array(b_min) + + # argmax should return 16 (1-indexed) for all rows + @test all(b_max_cpu .== 16) + # argmin should return 1 (1-indexed) for all rows + @test all(b_min_cpu .== 1) + + # Test with random data + a_rand = CUDA.rand(Float32, m, n) + b_max_rand = CUDA.zeros(Int32, m, 1) + b_min_rand = CUDA.zeros(Int32, m, 1) + + ct.launch(argmax_kernel, 1, a_rand, b_max_rand) + ct.launch(argmin_kernel, 1, a_rand, b_min_rand) + + a_rand_cpu = Array(a_rand) + # Compare with CPU argmax/argmin (Julia returns CartesianIndex, extract column) + for row in 1:m + expected_max = argmax(a_rand_cpu[row, :]) + expected_min = argmin(a_rand_cpu[row, :]) + @test Array(b_max_rand)[row, 1] == expected_max + @test Array(b_min_rand)[row, 1] == expected_min + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 3d0e0d1..9330bab 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -43,7 +43,7 @@ args = parse_args(ARGS) if filter_tests!(testsuite, args) cuda_functional = CUDA.functional() filter!(testsuite) do (test, _) - if in(test, ["execution"]) || startswith(test, "examples/") + if startswith(test, "execution/") || startswith(test, "examples/") return cuda_functional else return true From 4bb5cc3d3cb6f1ee885809b3c9d81c4dacff6dbd Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Mon, 9 Feb 2026 10:19:16 +0100 Subject: [PATCH 2/2] Add workspace. --- Project.toml | 3 +++ examples/Project.toml | 6 ++++++ 2 files changed, 9 insertions(+) create mode 100644 examples/Project.toml diff --git a/Project.toml b/Project.toml index 1942cd8..dd1c4ea 100644 --- a/Project.toml +++ b/Project.toml @@ -3,6 +3,9 @@ uuid = "0dea8319-8c4a-4662-a73d-20234d115b9a" version = "0.1.0" authors = ["Tim Besard "] +[workspace] +projects = ["test", "examples"] + [deps] BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b" CompilerCaching = "9db33cc3-5358-4881-8759-fa4194144afd" diff --git a/examples/Project.toml b/examples/Project.toml new file mode 100644 index 0000000..877d0e4 --- /dev/null +++ b/examples/Project.toml @@ -0,0 +1,6 @@ +[deps] +FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +cuTile = "0dea8319-8c4a-4662-a73d-20234d115b9a" + +[sources] +cuTile = {path = ".."}