diff --git a/src/compiler/codegen/expressions.jl b/src/compiler/codegen/expressions.jl index ee2ef26..02b7c38 100644 --- a/src/compiler/codegen/expressions.jl +++ b/src/compiler/codegen/expressions.jl @@ -34,17 +34,7 @@ In Tile IR codegen, only ghost types (zero-size immutables like `Val{V}`, """ function emit_new!(ctx::CGCtx, expr::Expr, @nospecialize(result_type)) T = CC.widenconst(result_type) - - # Ghost types: no runtime representation - if is_ghost_type(T) - if T <: Val && length(T.parameters) == 1 - return ghost_value(T, T.parameters[1]) - elseif T <: Constant && length(T.parameters) >= 2 - return ghost_value(T, T.parameters[2]) - end - return ghost_value(T) - end - + is_ghost_type(T) && return ghost_value(T) throw(IRError("Struct construction not supported in Tile IR: $T")) end @@ -69,7 +59,7 @@ function emit_rhs!(ctx::CGCtx, @nospecialize(rhs), @nospecialize(result_type)) elseif rhs isa QuoteNode return emit_constant!(ctx, rhs.value, result_type) elseif rhs isa GlobalRef - return nothing + return emit_value!(ctx, rhs) else return emit_constant!(ctx, rhs, result_type) end diff --git a/src/compiler/codegen/utils.jl b/src/compiler/codegen/utils.jl index 7503d1c..2676dd4 100644 --- a/src/compiler/codegen/utils.jl +++ b/src/compiler/codegen/utils.jl @@ -102,6 +102,16 @@ Optionally stores a compile-time constant value. ghost_value(@nospecialize(jltype)) = CGVal(nothing, TypeId(-1), jltype, Int[], nothing, nothing, nothing) ghost_value(@nospecialize(jltype), constant) = CGVal(nothing, TypeId(-1), jltype, Int[], nothing, Some(constant), nothing) +""" + constant_value(jltype, type_id, constant) -> CGVal + +Deferred constant: has a Tile IR type but no bytecode value yet. +Materialized (ConstantOp emitted) on demand at SSA lookup time. +Parallel to Julia's non-ghost cgval from mark_julia_const. 
+""" +constant_value(@nospecialize(jltype), type_id::TypeId, constant) = + CGVal(nothing, type_id, jltype, Int[], nothing, Some(constant), nothing) + """ tuple_value(jltype, component_refs, component_constants) -> CGVal @@ -278,15 +288,22 @@ function require_concrete_type(@nospecialize(T), context::String) end """ - tile_type_for_julia!(ctx, T) -> TypeId + tile_type_for_julia!(ctx, T; throw_error=true) -> TypeId or nothing -Get or create a Tile IR type for a Julia type. +Get or create a Tile IR type for a Julia type. With `throw_error=false`, returns +`nothing` instead of throwing if the type has no Tile IR representation. """ -function tile_type_for_julia!(ctx::CGCtx, @nospecialize(T)) +function tile_type_for_julia!(ctx::CGCtx, @nospecialize(T); throw_error::Bool=true) actual_type = CC.widenconst(T) - get!(ctx.type_cache, actual_type) do - _tile_type_for_julia!(ctx.tt, actual_type) + cached = get(ctx.type_cache, actual_type, nothing) + cached !== nothing && return cached + type_id = _tile_type_for_julia!(ctx.tt, actual_type) + if type_id !== nothing + ctx.type_cache[actual_type] = type_id + return type_id end + throw_error && throw(IRError("Unsupported Julia type for Tile IR: $actual_type")) + return nothing end function _tile_type_for_julia!(tt::TypeTable, @nospecialize(T::Type)) @@ -333,7 +350,7 @@ function _tile_type_for_julia!(tt::TypeTable, @nospecialize(T::Type)) return tile_type!(tt, elem_dtype, shape) end - throw(IRError("Unsupported Julia type for Tile IR: $T")) + return nothing end """ @@ -414,17 +431,6 @@ function extract_argument_index(@nospecialize(arg)) nothing end -function resolve_or_constant(ctx::CGCtx, @nospecialize(arg), type_id::TypeId) - tv = emit_value!(ctx, arg) - # If we have a runtime value, use it - tv.v !== nothing && return tv.v - # Otherwise emit a constant from the compile-time value - tv.constant === nothing && throw(IRError("Cannot resolve argument")) - val = something(tv.constant) - bytes = reinterpret(UInt8, [Int32(val)]) - 
encode_ConstantOp!(ctx.cb, type_id, collect(bytes)) -end - #----------------------------------------------------------------------------- # Tile helpers #----------------------------------------------------------------------------- diff --git a/src/compiler/codegen/values.jl b/src/compiler/codegen/values.jl index 7286ae0..64c3c27 100644 --- a/src/compiler/codegen/values.jl +++ b/src/compiler/codegen/values.jl @@ -7,8 +7,27 @@ Emit/resolve a value reference to a CGVal using multiple dispatch. """ function emit_value!(ctx::CGCtx, ssa::SSAValue) tv = ctx[ssa] - tv !== nothing && return tv - throw(IRError("SSAValue %$(ssa.id) not found in context")) + tv !== nothing || throw(IRError("SSAValue %$(ssa.id) not found in context")) + return maybe_materialize!(ctx, ssa, tv) +end + +""" + maybe_materialize!(ctx, ssa, tv) -> CGVal + +Materialize a deferred constant into bytecode on demand. +Only acts on CGVals with `type_id != TypeId(-1)` and no value yet (deferred constants). +""" +function maybe_materialize!(ctx::CGCtx, ssa::SSAValue, tv::CGVal) + tv.v !== nothing && return tv # already materialized + tv.type_id == TypeId(-1) && return tv # ghost — nothing to materialize + tv.constant === nothing && return tv # no constant to materialize + + val = something(tv.constant) + bytes = constant_to_bytes(val, CC.widenconst(tv.jltype)) + v = encode_ConstantOp!(ctx.cb, tv.type_id, bytes) + materialized = CGVal(v, tv.type_id, tv.jltype, Int[], nothing, tv.constant, nothing) + ctx[ssa] = materialized + return materialized end emit_value!(ctx::CGCtx, arg::Argument) = ctx[arg] emit_value!(ctx::CGCtx, slot::SlotNumber) = ctx[slot] @@ -61,7 +80,17 @@ end function emit_value!(ctx::CGCtx, ref::GlobalRef) val = getfield(ref.mod, ref.name) - ghost_value(typeof(val), val) + T = typeof(val) + # Ghost types have no materializable representation. + # Non-ghost types with a Tile IR type become deferred constants (materialized on demand). + # Everything else (functions, enums, etc.) 
is compile-time only → ghost. + if !is_ghost_type(T) + type_id = tile_type_for_julia!(ctx, T; throw_error=false) + if type_id !== nothing + return constant_value(T, type_id, val) + end + end + ghost_value(T, val) end function emit_value!(ctx::CGCtx, node::PiNode) @@ -73,7 +102,7 @@ function emit_value!(ctx::CGCtx, node::PiNode) emit_value!(ctx, node.val) end -emit_value!(ctx::CGCtx, ::Nothing) = nothing +emit_value!(ctx::CGCtx, ::Nothing) = ghost_value(Nothing, nothing) """ get_constant(ctx, ref) -> Union{Any, Nothing} @@ -89,7 +118,14 @@ function get_constant(ctx::CGCtx, @nospecialize(ref)) end # IR references - extract constant through emit_value! tv = emit_value!(ctx, ref) - tv === nothing ? nothing : (tv.constant === nothing ? nothing : something(tv.constant)) + tv === nothing && return nothing + if tv.constant !== nothing + return something(tv.constant) + end + # Any ghost singleton can be reconstructed from its type + T = CC.widenconst(tv.jltype) + is_ghost_type(T) && isdefined(T, :instance) && return T.instance + return nothing end # Symbols are compile-time only values @@ -107,15 +143,8 @@ emit_value!(ctx::CGCtx, @nospecialize(val::Type)) = ghost_value(Type{val}, val) # Fallback for other types (constants embedded in IR) function emit_value!(ctx::CGCtx, @nospecialize(val)) T = typeof(val) - # Handle Val{V} instances - if T <: Val && length(T.parameters) == 1 - return ghost_value(T, T.parameters[1]) - end - # Handle Constant{T, V} instances - if T <: Constant && length(T.parameters) >= 2 - return ghost_value(T, T.parameters[2]) - end - throw(IRError("Unhandled value type in emit_value!: $(typeof(val))")) + is_ghost_type(T) && return ghost_value(T, val) + throw(IRError("Unsupported value type in Tile IR codegen: $T")) end diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl index 59628d0..bdc6e8a 100644 --- a/src/compiler/interface.jl +++ b/src/compiler/interface.jl @@ -73,6 +73,16 @@ CC.may_optimize(::cuTileInterpreter) = true 
CC.may_compress(::cuTileInterpreter) = true CC.may_discard_trees(::cuTileInterpreter) = false +#============================================================================= + Custom return-type inference (tfuncs) for intrinsics +=============================================================================# + +# Per-intrinsic return type overrides using multiple dispatch. +# Returns nothing when no override applies (fallback). +# Concrete per-intrinsic methods are defined in intrinsics/ (after the +# Intrinsics module exists). +tfunc(@nospecialize(f), argtypes::Vector{Any}) = nothing + #============================================================================= Subprogram inference for reduce/scan =============================================================================# @@ -147,7 +157,8 @@ function _infer_subprogram(interp::cuTileInterpreter, @nospecialize(f), end end -# Override abstract_call_known to trigger subprogram inference for reduce/scan. +# Override abstract_call_known for custom return-type inference (tfuncs) and +# subprogram inference for reduce/scan. # # On 1.12+, abstract_call_known returns Future{CallMeta}. The caller uses the # CallMeta.info to populate stmt_info[pc], which compute_edges! later walks. 
@@ -161,16 +172,18 @@ end result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any, arginfo::CC.ArgInfo, si::CC.StmtInfo, vtypes::Union{CC.VarTable,Nothing}, sv::CC.InferenceState, max_methods::Int) + rt_override = tfunc(f, arginfo.argtypes) subprog = _infer_subprogram(interp, f, arginfo, si, vtypes, sv) - subprog === nothing && return result + rt_override === nothing && subprog === nothing && return result wrapped = CC.Future{CC.CallMeta}() push!(sv.tasks, function (interp′, sv′) isready(result) || return false - isready(subprog) || return false + subprog !== nothing && !isready(subprog) && return false cm = result[] - sp = subprog[] - wrapped[] = CC.CallMeta(cm.rt, cm.exct, cm.effects, - SubprogramCallInfo(cm.info, sp.info), cm.refinements) + sp = subprog !== nothing ? subprog[] : nothing + rt = rt_override !== nothing ? rt_override : cm.rt + info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info + wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements) return true end) return wrapped @@ -182,16 +195,18 @@ elseif isdefined(CC, :Future) # 1.12–1.13 result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any, arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.InferenceState, max_methods::Int) + rt_override = tfunc(f, arginfo.argtypes) subprog = _infer_subprogram(interp, f, arginfo, si, nothing, sv) - subprog === nothing && return result + rt_override === nothing && subprog === nothing && return result wrapped = CC.Future{CC.CallMeta}() push!(sv.tasks, function (interp′, sv′) isready(result) || return false - isready(subprog) || return false + subprog !== nothing && !isready(subprog) && return false cm = result[] - sp = subprog[] - wrapped[] = CC.CallMeta(cm.rt, cm.exct, cm.effects, - SubprogramCallInfo(cm.info, sp.info), cm.refinements) + sp = subprog !== nothing ? subprog[] : nothing + rt = rt_override !== nothing ? rt_override : cm.rt + info = sp !== nothing ? 
SubprogramCallInfo(cm.info, sp.info) : cm.info + wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements) return true end) return wrapped @@ -204,6 +219,11 @@ else # 1.11: synchronous, edges auto-tracked via stmt_edges arginfo::CC.ArgInfo, si::CC.StmtInfo, sv::CC.AbsIntState, max_methods::Int) _infer_subprogram(interp, f, arginfo, si, nothing, sv) # side-effect only + rt_override = tfunc(f, arginfo.argtypes) + if rt_override !== nothing + return CC.CallMeta(rt_override, result.exct, result.effects, + result.info) + end return result end end diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl index 9beed19..aa0d425 100644 --- a/src/compiler/intrinsics.jl +++ b/src/compiler/intrinsics.jl @@ -19,6 +19,15 @@ end # Sometimes that's not possible, e.g., because the functionality required for that is # overlayed by methods calling back into the intrinsic (e.g. `sin`), so for those # intrinsics we disable constant folding using a `compilerbarrier(:const)` +# +# NOTE: Side-effectful intrinsics (stores, atomics) use `donotdelete(args...)` in their +# bodies to prevent the optimizer from DCE'ing calls. `donotdelete` is a Julia builtin +# with `effect_free=ALWAYS_FALSE`, which inference propagates through the function body. +# `@assume_effects !:effect_free` does NOT work — `override_effects` can only strengthen +# effects (set ALWAYS_TRUE), not weaken them. Spoofing `ipo_effects` via a custom +# `CC.finish!` override is possible but fragile (must race against `finishinfer!` setting +# `use_const_api` based on pre-override effects). `donotdelete` is the simplest correct +# approach. 
emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl index 6366910..6272251 100644 --- a/src/compiler/intrinsics/arithmetic.jl +++ b/src/compiler/intrinsics/arithmetic.jl @@ -97,7 +97,7 @@ end # cuda_tile.addi @eval Intrinsics begin @noinline addi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.add_int(x, y) - @noinline addi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline addi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addi), args) emit_binop!(ctx, args, encode_AddIOp!) @@ -105,7 +105,7 @@ end # cuda_tile.cldi (ceiling division, toward positive infinity) @eval Intrinsics begin - @noinline cldi(x::T, y::T, s::Signedness) where {T<:Integer} = (donotdelete(x, y, s); compilerbarrier(:const, zero(T))) + @noinline cldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T)) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("cldi requires compile-time signedness")) @@ -130,7 +130,7 @@ end end end @noinline cmpi(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate, ::Signedness) where {T<:Integer, S} = - (donotdelete(a, b); Tile{Bool, S}()) + Tile{Bool, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args) cb = ctx.cb @@ -168,7 +168,7 @@ end # cuda_tile.fldi (floor division, toward negative infinity) @eval Intrinsics begin - @noinline fldi(x::T, y::T, s::Signedness) where {T<:Integer} = (donotdelete(x, y, s); compilerbarrier(:const, zero(T))) + @noinline fldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T)) end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("fldi requires 
compile-time signedness")) @@ -182,7 +182,7 @@ end ifelse(lt, y, x) end @noinline maxi(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = - (donotdelete(a, b); Tile{T, S}()) + Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("maxi requires compile-time signedness")) @@ -196,7 +196,7 @@ end ifelse(lt, x, y) end @noinline mini(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = - (donotdelete(a, b); Tile{T, S}()) + Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args) signedness = @something get_constant(ctx, args[3]) throw(IRError("mini requires compile-time signedness")) @@ -206,7 +206,7 @@ end # cuda_tile.muli @eval Intrinsics begin @noinline muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y) - @noinline muli(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline muli(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args) emit_binop!(ctx, args, encode_MulIOp!) @@ -218,7 +218,7 @@ end @noinline function mulhii(x::T, y::T, s::Signedness) where {T<:Integer} ((widen(x) * widen(y)) >>> (8 * sizeof(T))) % T end - @noinline mulhii(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline mulhii(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulhii), args) emit_binop!(ctx, args, encode_MulhiIOp!) 
@@ -266,7 +266,7 @@ end # cuda_tile.subi @eval Intrinsics begin @noinline subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y) - @noinline subi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline subi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args) emit_binop!(ctx, args, encode_SubIOp!) @@ -287,7 +287,7 @@ end # cuda_tile.addf @eval Intrinsics begin @noinline addf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.add_float(x, y) - @noinline addf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline addf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addf), args) emit_binop!(ctx, args, encode_AddFOp!) @@ -311,7 +311,7 @@ end end end @noinline cmpf(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate) where {T<:AbstractFloat, S} = - (donotdelete(a, b); Tile{Bool, S}()) + Tile{Bool, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args) cb = ctx.cb @@ -338,7 +338,7 @@ end # cuda_tile.divf @eval Intrinsics begin @noinline divf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.div_float(x, y) - @noinline divf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline divf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divf), args) emit_binop!(ctx, args, encode_DivFOp!) 
@@ -347,7 +347,7 @@ end # cuda_tile.mulf @eval Intrinsics begin @noinline mulf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.mul_float(x, y) - @noinline mulf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline mulf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulf), args) emit_binop!(ctx, args, encode_MulFOp!) @@ -365,7 +365,7 @@ end # cuda_tile.subf @eval Intrinsics begin @noinline subf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.sub_float(x, y) - @noinline subf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline subf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subf), args) emit_binop!(ctx, args, encode_SubFOp!) @@ -378,7 +378,7 @@ end @eval Intrinsics begin @noinline andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y) """Element-wise logical AND for boolean tiles.""" - @noinline andi(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = (donotdelete(a, b); Tile{Bool, S}()) + @noinline andi(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args) cb = ctx.cb @@ -399,7 +399,7 @@ end @eval Intrinsics begin @noinline ori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.or_int(x, y) """Element-wise logical OR for boolean tiles.""" - @noinline ori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = (donotdelete(a, b); Tile{Bool, S}()) + @noinline ori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args) cb = ctx.cb @@ -420,7 +420,7 @@ end @eval Intrinsics begin @noinline xori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.xor_int(x, y) """Element-wise logical XOR for boolean tiles.""" - @noinline 
xori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = (donotdelete(a, b); Tile{Bool, S}()) + @noinline xori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args) cb = ctx.cb diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl index e31d966..3c89bd4 100644 --- a/src/compiler/intrinsics/atomics.jl +++ b/src/compiler/intrinsics/atomics.jl @@ -41,7 +41,7 @@ end """ @noinline function atomic_cas(array::TileArray{T, N}, index, expected, desired, memory_order::Int, memory_scope::Int) where {T, N} - donotdelete(array, index, expected, desired) + donotdelete() compilerbarrier(:const, zero(T))::T end end @@ -179,7 +179,7 @@ end """ @noinline function atomic_xchg(array::TileArray{T, N}, index, val, memory_order::Int, memory_scope::Int) where {T, N} - donotdelete(array, index, val) + donotdelete() compilerbarrier(:const, zero(T)) end end @@ -198,7 +198,7 @@ end """ @noinline function atomic_add(array::TileArray{T, N}, index, val, memory_order::Int, memory_scope::Int) where {T, N} - donotdelete(array, index, val) + donotdelete() compilerbarrier(:const, zero(T)) end end diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl index 197e13b..5b522c4 100644 --- a/src/compiler/intrinsics/conversions.jl +++ b/src/compiler/intrinsics/conversions.jl @@ -10,7 +10,6 @@ cuda_tile.exti, or cuda_tile.trunci based on source/target types. """ @noinline function astype(tile::Tile{T1, Shape}, ::Type{T2}) where {T1, Shape, T2} - donotdelete(tile) Tile{T2, Shape}() end end diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl index 25b6c34..bcb3eca 100644 --- a/src/compiler/intrinsics/core.jl +++ b/src/compiler/intrinsics/core.jl @@ -26,10 +26,20 @@ end Explicitly broadcast a tile to a target shape. Compiled to cuda_tile.broadcast. 
""" - @noinline function broadcast(tile::Tile{T}, ::Val{Shape}) where {T, Shape} - Tile{T, Tuple{Shape...}}() + @noinline function broadcast(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N} + compilerbarrier(:type, nothing) end end +function tfunc(::typeof(Intrinsics.broadcast), argtypes::Vector{Any}) + length(argtypes) >= 3 || return nothing + tile_type = CC.widenconst(argtypes[2]) + tile_type <: Tile || return nothing + shape_arg = argtypes[3] + isa(shape_arg, CC.Const) || return nothing + shape = shape_arg.val + T = eltype(tile_type) + return Tile{T, Tuple{shape...}} +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.broadcast), args) cb = ctx.cb tt = ctx.tt @@ -42,7 +52,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.broadcast), args) source_type = CC.widenconst(source.jltype) source_elem = eltype(source_type) - # Extract target shape from the constant tuple argument + # Extract target shape target_shape_tuple = get_constant(ctx, args[2]) target_shape_tuple isa Tuple || throw(IRError("broadcast() shape must be a compile-time constant tuple")) target_shape = collect(Int, target_shape_tuple) @@ -106,19 +116,28 @@ end Concatenate two tiles along 0-indexed axis. Compiled to cuda_tile.cat. """ - @noinline function cat(tiles::Tuple{Tile{T}, Tile{T}}, ::Val{Axis}) where {T, Axis} - t1, t2 = tiles - n = ndims(t1) - axis = Axis < 0 ? 
n + Axis : Axis - result_shape = ntuple(n) do i - if i == axis + 1 # 0-indexed axis, 1-indexed tuple access - size(t1, i) + size(t2, i) - else - size(t1, i) - end - end - Tile{T, Tuple{result_shape...}}() - end + @noinline function cat(tiles::Tuple{Tile{T, S1}, Tile{T, S2}}, axis::Integer) where {T, S1, S2} + compilerbarrier(:type, nothing) + end +end +function tfunc(::typeof(Intrinsics.cat), argtypes::Vector{Any}) + length(argtypes) >= 3 || return nothing + tuple_type = CC.widenconst(argtypes[2]) + (tuple_type isa DataType && tuple_type <: Tuple{Tile, Tile}) || return nothing # UnionAll tuples have no .parameters + axis_arg = argtypes[3] + isa(axis_arg, CC.Const) || return nothing + axis = axis_arg.val + t1_type = tuple_type.parameters[1] + t2_type = tuple_type.parameters[2] + (t1_type <: Tile && t2_type <: Tile) || return nothing + T = eltype(t1_type) + s1 = size(t1_type) + s2 = size(t2_type) + (isempty(s1) || length(s1) != length(s2) || !(axis isa Integer)) && return nothing # never throw inside inference; codegen reports bad operands + n = length(s1) + a = axis < 0 ? n + axis : axis + result_shape = ntuple(i -> i == a + 1 ? s1[i] + s2[i] : s1[i], n) + return Tile{T, Tuple{result_shape...}} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cat), args) cb = ctx.cb @@ -137,7 +156,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cat), args) rhs = emit_value!(ctx, tuple_tv.tuple[2]) (lhs === nothing || rhs === nothing) && throw(IRError("Cannot resolve tile operands for cat()")) - # Get axis from Val{Axis} + # Get axis axis_val = get_constant(ctx, args[2]) axis_val isa Integer || throw(IRError("cat() axis must be a compile-time constant integer")) @@ -175,9 +194,19 @@ end Compiled to cuda_tile.constant. 
""" @noinline function constant(shape::NTuple{N, Int}, value, ::Type{T}) where {N, T} - Tile{T, Tuple{shape...}}() + compilerbarrier(:type, nothing) end end +function tfunc(::typeof(Intrinsics.constant), argtypes::Vector{Any}) + length(argtypes) >= 4 || return nothing + shape_arg = argtypes[2] + isa(shape_arg, CC.Const) || return nothing + shape = shape_arg.val + type_arg = CC.widenconst(argtypes[4]) + (type_arg isa DataType && type_arg <: Type && length(type_arg.parameters) == 1) || return nothing # only Type{X}: DataType/UnionAll have no parameters[1] + T = type_arg.parameters[1] + return Tile{T, Tuple{shape...}} +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.constant), args) cb = ctx.cb tt = ctx.tt @@ -214,10 +243,20 @@ end Extract a sub-tile from tile at 0-indexed slice indices. Compiled to cuda_tile.extract. """ - @noinline function extract(tile::Tile{T}, ::Val{Index}, ::Val{Shape}) where {T, Index, Shape} - Tile{T, Tuple{Shape...}}() + @noinline function extract(tile::Tile{T}, index::NTuple{N, Int}, shape::NTuple{N, Int}) where {T, N} + compilerbarrier(:type, nothing) end end +function tfunc(::typeof(Intrinsics.extract), argtypes::Vector{Any}) + length(argtypes) >= 4 || return nothing + tile_type = CC.widenconst(argtypes[2]) + tile_type <: Tile || return nothing + shape_arg = argtypes[4] + isa(shape_arg, CC.Const) || return nothing + shape = shape_arg.val + T = eltype(tile_type) + return Tile{T, Tuple{shape...}} +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.extract), args) cb = ctx.cb tt = ctx.tt @@ -226,11 +265,11 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.extract), args) source = emit_value!(ctx, args[1]) source === nothing && throw(IRError("Cannot resolve source operand for extract()")) - # Extract index from Val{Index} argument + # Extract index index_tuple = get_constant(ctx, args[2]) index_tuple isa Tuple || throw(IRError("extract() index must be a compile-time constant tuple")) - # Extract shape from Val{Shape} argument + # Extract shape shape_tuple = get_constant(ctx, args[3]) shape_tuple isa Tuple || 
throw(IRError("extract() shape must be a compile-time constant tuple")) output_shape = collect(Int, shape_tuple) @@ -312,9 +351,19 @@ end Compiled to cuda_tile.iota. """ @noinline function iota(shape::NTuple{1, Int}, ::Type{T}) where {T} - Tile{T, Tuple{shape...}}() + compilerbarrier(:type, nothing) end end +function tfunc(::typeof(Intrinsics.iota), argtypes::Vector{Any}) + length(argtypes) >= 3 || return nothing + shape_arg = argtypes[2] + isa(shape_arg, CC.Const) || return nothing + shape = shape_arg.val + type_arg = CC.widenconst(argtypes[3]) + (type_arg isa DataType && type_arg <: Type && length(type_arg.parameters) == 1) || return nothing # only Type{X}: DataType/UnionAll have no parameters[1] + T = type_arg.parameters[1] + return Tile{T, Tuple{shape...}} +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args) cb = ctx.cb tt = ctx.tt @@ -427,11 +476,22 @@ end Permute tile dimensions according to 0-indexed permutation. Compiled to cuda_tile.permute. """ - @noinline function permute(tile::Tile{T}, ::Val{Perm}) where {T, Perm} - # Compute permuted shape: for each position i in output, take size(tile, Perm[i]+1) - permuted_shape = ntuple(i -> size(tile, Perm[i] + 1), ndims(tile)) - Tile{T, Tuple{permuted_shape...}}() - end + @noinline function permute(tile::Tile{T, S}, perm::NTuple{N, Int}) where {T, S, N} + compilerbarrier(:type, nothing) + end +end +function tfunc(::typeof(Intrinsics.permute), argtypes::Vector{Any}) + length(argtypes) >= 3 || return nothing + tile_type = CC.widenconst(argtypes[2]) + tile_type <: Tile || return nothing + perm_arg = argtypes[3] + isa(perm_arg, CC.Const) || return nothing + perm = perm_arg.val + s = size(tile_type) + (isempty(s) || !(perm isa Tuple) || !all(p -> p isa Integer && 0 <= p < length(s), perm)) && return nothing # out-of-range perm must not BoundsError inside inference + T = eltype(tile_type) + permuted_shape = ntuple(i -> s[perm[i] + 1], length(perm)) + return Tile{T, Tuple{permuted_shape...}} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.permute), args) cb = ctx.cb @@ -444,7 +504,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.permute), args) input_shape = source.shape isempty(input_shape) && throw(IRError("Cannot 
determine tile shape for permute()")) - # Extract permutation from Val{Perm} argument + # Extract permutation perm_tuple = get_constant(ctx, args[2]) perm_tuple isa Tuple || throw(IRError("permute() permutation must be a compile-time constant tuple")) @@ -477,9 +537,18 @@ end Compiled to cuda_tile.permute with perm=(1, 0). """ @noinline function transpose(tile::Tile{T}) where {T} - Tile{T, Tuple{reverse(size(tile))...}}() + compilerbarrier(:type, nothing) end end +function tfunc(::typeof(Intrinsics.transpose), argtypes::Vector{Any}) + length(argtypes) >= 2 || return nothing + tile_type = CC.widenconst(argtypes[2]) + tile_type <: Tile || return nothing + s = size(tile_type) + isempty(s) && return nothing + T = eltype(tile_type) + return Tile{T, Tuple{reverse(s)...}} +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.transpose), args) cb = ctx.cb tt = ctx.tt @@ -516,18 +585,32 @@ end callers wrap in 1-tuples and unwrap with `[1]`. Compiled to cuda_tile.reduce. """ - @noinline function reduce(tiles::Tuple{Tile{T}}, ::Val{axis}, f, - identities::Tuple{Any}) where {T, axis} - tile = tiles[1] - reduced_shape = ntuple(i -> i == axis + 1 ? 1 : size(tile, i), ndims(tile)) - (Tile{T, Tuple{reduced_shape...}}(),) - end - @noinline function reduce(tiles::Tuple{Tile{T}, Tile, Vararg{Tile}}, ::Val{axis}, f, - identities::Tuple{Any, Any, Vararg{Any}}) where {T, axis} - tile = tiles[1] - reduced_shape = ntuple(i -> i == axis + 1 ? 1 : size(tile, i), ndims(tile)) - (Tile{T, Tuple{reduced_shape...}}(), reduce(Base.tail(tiles), Val(axis), f, Base.tail(identities))...) 
- end + @noinline function reduce(tiles::Tuple{Tile{T, S}}, axis::Integer, f, + identities::Tuple{Any}) where {T, S} + compilerbarrier(:type, nothing) + end + @noinline function reduce(tiles::Tuple{Tile{T1, S}, Tile{T2, S}}, axis::Integer, f, + identities::Tuple{Any, Any}) where {T1, T2, S} + compilerbarrier(:type, nothing) + end +end +function tfunc(::typeof(Intrinsics.reduce), argtypes::Vector{Any}) + length(argtypes) >= 3 || return nothing + tuple_type = CC.widenconst(argtypes[2]) + tuple_type isa DataType && tuple_type <: Tuple || return nothing + axis_arg = argtypes[3] + isa(axis_arg, CC.Const) || return nothing + axis = axis_arg.val + result_params = Any[] + for p in tuple_type.parameters + p isa DataType && p <: Tile || return nothing + T = eltype(p) + s = size(p) + isempty(s) && return nothing + reduced_shape = ntuple(i -> i == axis + 1 ? 1 : s[i], length(s)) + push!(result_params, Tile{T, Tuple{reduced_shape...}}) + end + return Tuple{result_params...} end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce), args) emit_reduce!(ctx, args) @@ -547,7 +630,7 @@ function emit_reduce!(ctx::CGCtx, args) N = length(tile_tvs) # Get reduction axis - axis = @something get_constant(ctx, args[2]) throw(IRError("Reduction axis must be a compile-time constant")) + axis = get_constant(ctx, args[2]); axis isa Integer || throw(IRError("Reduction axis must be a compile-time constant integer")) # Resolve combiner function func = get_constant(ctx, args[3]) @@ -648,10 +731,20 @@ make_identity_val(val, dtype, ::Type{T}) where T <: Integer = Reshape a tile to a new shape (same total elements). Compiled to cuda_tile.reshape. 
""" - @noinline function reshape(tile::Tile{T}, ::Val{Shape}) where {T, Shape} - Tile{T, Tuple{Shape...}}() + @noinline function reshape(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N} + compilerbarrier(:type, nothing) end end +function tfunc(::typeof(Intrinsics.reshape), argtypes::Vector{Any}) + length(argtypes) >= 3 || return nothing + tile_type = CC.widenconst(argtypes[2]) + tile_type <: Tile || return nothing + shape_arg = argtypes[3] + isa(shape_arg, CC.Const) || return nothing + shape = shape_arg.val + T = eltype(tile_type) + return Tile{T, Tuple{shape...}} +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reshape), args) cb = ctx.cb tt = ctx.tt @@ -660,7 +753,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reshape), args) source = emit_value!(ctx, args[1]) source === nothing && throw(IRError("Cannot resolve source operand for reshape()")) - # Extract target shape from Val{Shape} argument + # Extract target shape target_shape_tuple = get_constant(ctx, args[2]) target_shape_tuple isa Tuple || throw(IRError("reshape() shape must be a compile-time constant tuple")) target_shape = collect(Int, target_shape_tuple) @@ -720,11 +813,22 @@ end `reverse=true` for a reverse (suffix) scan. Compiled to cuda_tile.scan. 
""" - @noinline function scan(tiles::Tuple{Tile{T, S}}, ::Val{axis}, f, - identities::Tuple{Any}, reverse::Bool=false) where {T, S, axis} - (Tile{T, S}(),) + @noinline function scan(tiles::Tuple{Tile{T, S}}, axis::Integer, f, + identities::Tuple{Any}, reverse::Bool=false) where {T, S} + compilerbarrier(:type, nothing) end end +function tfunc(::typeof(Intrinsics.scan), argtypes::Vector{Any}) + length(argtypes) >= 2 || return nothing + tuple_type = CC.widenconst(argtypes[2]) + tuple_type isa DataType && tuple_type <: Tuple || return nothing + result_params = Any[] + for p in tuple_type.parameters + p isa DataType && p <: Tile || return nothing + push!(result_params, p) + end + return Tuple{result_params...} +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) cb = ctx.cb tt = ctx.tt @@ -740,7 +844,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args) N = length(tile_tvs) # Get scan axis - axis = @something get_constant(ctx, args[2]) throw(IRError("Scan axis must be a compile-time constant")) + axis = get_constant(ctx, args[2]); axis isa Integer || throw(IRError("Scan axis must be a compile-time constant integer")) # Resolve combiner function func = get_constant(ctx, args[3]) @@ -844,11 +948,17 @@ end # to_scalar: jltype becomes scalar T (for overlay dispatch), but IR value stays shaped. # from_scalar: restores jltype to Tile{T, S}. 
@eval Intrinsics begin - @noinline to_scalar(tile::Tile{T, S}) where {T, S} = (donotdelete(tile); compilerbarrier(:const, T(0))) - # S is a tuple TYPE (e.g., Tuple{16}) passed through from the input tile - @noinline from_scalar(x::T, ::Val{S}) where {T, S} = (donotdelete(x); Tile{T, S}()) + @noinline to_scalar(tile::Tile{T, S}) where {T, S} = compilerbarrier(:const, T(0)) + @noinline from_scalar(x::T, ::Type{S}) where {T, S} = Tile{T, S}() +end +function tfunc(::typeof(Intrinsics.from_scalar), argtypes::Vector{Any}) + length(argtypes) >= 3 || return nothing + T = CC.widenconst(argtypes[2]) + shape_type = CC.widenconst(argtypes[3]) + shape_type <: Type || return nothing + S = shape_type.parameters[1] + return Tile{T, S} end - function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.to_scalar), args) tv = emit_value!(ctx, args[1]) tv === nothing && throw(IRError("Cannot resolve tile for to_scalar")) @@ -861,9 +971,9 @@ end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.from_scalar), args) tv = emit_value!(ctx, args[1]) tv === nothing && throw(IRError("Cannot resolve scalar for from_scalar")) - shape_val = @something get_constant(ctx, args[2]) throw(IRError("from_scalar shape must be constant")) + shape_type = get_constant(ctx, args[2]) T = CC.widenconst(tv.jltype) - CGVal(tv.v, tv.type_id, Tile{T, shape_val}, tv.shape, nothing, nothing, nothing) + CGVal(tv.v, tv.type_id, Tile{T, shape_type}, tv.shape, nothing, nothing, nothing) end # TODO: cuda_tile.unpack diff --git a/src/compiler/intrinsics/math.jl b/src/compiler/intrinsics/math.jl index f2572d1..ded13df 100644 --- a/src/compiler/intrinsics/math.jl +++ b/src/compiler/intrinsics/math.jl @@ -81,8 +81,8 @@ end # cuda_tile.fma @eval Intrinsics begin """Fused multiply-add: a * b + c. 
Compiled to cuda_tile.fma.""" - @noinline fma(x::T, y::T, z::T) where {T<:AbstractFloat} = (donotdelete(y, z); compilerbarrier(:const, x)) - @noinline fma(a::Tile{T, S}, b::Tile{T, S}, c::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b, c); Tile{T, S}()) + @noinline fma(x::T, y::T, z::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) + @noinline fma(a::Tile{T, S}, b::Tile{T, S}, c::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args) cb = ctx.cb @@ -135,7 +135,7 @@ end # cuda_tile.maxf @eval Intrinsics begin @noinline maxf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x > y || isnan(x), x, y) - @noinline maxf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline maxf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxf), args) emit_binop!(ctx, args, encode_MaxFOp!) @@ -144,7 +144,7 @@ end # cuda_tile.minf @eval Intrinsics begin @noinline minf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x < y || isnan(x), x, y) - @noinline minf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline minf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.minf), args) emit_binop!(ctx, args, encode_MinFOp!) @@ -153,8 +153,8 @@ end # cuda_tile.pow @eval Intrinsics begin """Element-wise power. 
Compiled to cuda_tile.pow.""" - @noinline pow(x::T, y::T) where {T<:AbstractFloat} = (donotdelete(x, y); compilerbarrier(:const, x)) - @noinline pow(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline pow(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) + @noinline pow(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.pow), args) emit_binop!(ctx, args, encode_PowOp!) @@ -164,7 +164,7 @@ end @eval Intrinsics begin """Element-wise floating-point remainder. Compiled to cuda_tile.remf.""" @noinline remf(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x) - @noinline remf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}()) + @noinline remf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}() end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remf), args) emit_binop!(ctx, args, encode_RemFOp!) diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl index 3bae03d..1d42ad5 100644 --- a/src/compiler/intrinsics/memory.jl +++ b/src/compiler/intrinsics/memory.jl @@ -18,7 +18,6 @@ latency::Union{Int, Nothing}=nothing, mask::Union{Tile{Bool, S}, Nothing}=nothing, padding::Union{Tile{T, S}, Nothing}=nothing) where {T, S} - donotdelete(ptrs, latency, mask, padding) Tile{T, S}() end end @@ -96,7 +95,7 @@ end @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S}, latency::Union{Int, Nothing}, mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S} - donotdelete(ptrs, values, latency, mask) + donotdelete() nothing end end diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl index 9fc00b3..c8f1a88 100644 --- a/src/compiler/intrinsics/views.jl +++ b/src/compiler/intrinsics/views.jl @@ -32,7 +32,6 @@ end Compiled to cuda_tile.get_index_space_shape. 
""" @noinline function get_index_space_shape(pv::PartitionView{T, N, Shape}, axis::Integer) where {T, N, Shape} - donotdelete(pv) compilerbarrier(:const, zero(Int32)) end end @@ -81,11 +80,20 @@ end latency::Union{Int, Nothing}, allow_tma::Bool, indices::NTuple{M, <:Integer}) where {T, N, Shape, M} - donotdelete(pv, latency, allow_tma) - # Shape is already a tuple TYPE (e.g., Tuple{64}) from make_partition_view - Tile{T, Shape}() + compilerbarrier(:type, nothing) end end +function tfunc(::typeof(Intrinsics.load_partition_view), argtypes::Vector{Any}) + length(argtypes) >= 2 || return nothing + pv_type = CC.widenconst(argtypes[2]) + pv_type <: PartitionView || return nothing + pv_type isa DataType || return nothing + length(pv_type.parameters) >= 3 || return nothing + T = eltype(pv_type) + Shape = pv_type.parameters[3] + Shape isa Type || return nothing + return Tile{T, Shape} +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_partition_view), args) cb = ctx.cb tt = ctx.tt @@ -169,21 +177,31 @@ end make_partition_view(tv::TensorView, shape_val, padding_mode, order) -> PartitionView Create a PartitionView from a TensorView with the given tile shape. - The `order` parameter (Val{NTuple{N,Int}} or Val{nothing}) specifies + The `order` parameter (NTuple{N,Int} or nothing) specifies the logical-to-physical dimension mapping (1-indexed), or identity if nothing. Compiled to cuda_tile.make_partition_view. 
""" - @noinline function make_partition_view(tv::TensorView{T, N}, ::Val{Shape}, padding_mode::Int, ::Val{Order}) where {T, N, Shape, Order} - donotdelete(tv) - PartitionView{T, N, Tuple{Shape...}}() + @noinline function make_partition_view(tv::TensorView{T, N}, shape::NTuple{M, Int}, padding_mode::Int, order) where {T, N, M} + compilerbarrier(:type, nothing) end end +function tfunc(::typeof(Intrinsics.make_partition_view), argtypes::Vector{Any}) + length(argtypes) >= 3 || return nothing + tv_type = CC.widenconst(argtypes[2]) + tv_type <: TensorView || return nothing + shape_arg = argtypes[3] + isa(shape_arg, CC.Const) || return nothing + shape = shape_arg.val + T = eltype(tv_type) + N = ndims(tv_type) + return PartitionView{T, N, Tuple{shape...}} +end function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_partition_view), args) tv = emit_value!(ctx, args[1]) tv === nothing && throw(IRError("make_partition_view() requires a TensorView argument")) - # User boundary: Val{Shape} contains VALUE tuple from user call (e.g., load(arr, idx, (16,))) - shape = @something get_constant(ctx, args[2]) throw(IRError("make_partition_view() shape must be a compile-time constant")) + # Shape from user call (e.g., load(arr, idx, (16,))) + shape = get_constant(ctx, args[2]) shape isa Tuple || throw(IRError("make_partition_view() shape must be a tuple, got $(typeof(shape))")) tile_shape = collect(Int, shape) validate_tile_shape(tile_shape, "load") @@ -195,11 +213,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_partition_view), a elem_type = eltype(tv.jltype) ndim = length(tile_shape) - # Extract order from Val{Order} (arg 4) - # Val{nothing} → identity dim_map, Val{(2,1)} → [1, 0] (1-indexed → 0-indexed) - order_arg = emit_value!(ctx, args[4]) - order_jltype = CC.widenconst(order_arg.jltype) - order_val = order_jltype.parameters[1] # nothing or NTuple{N,Int} + # Extract order (arg 4) + # nothing → identity dim_map, (2,1) → [1, 0] (1-indexed → 0-indexed) + 
order_val = get_constant(ctx, args[4]) if order_val === nothing dim_map = collect(0:ndim-1) else @@ -328,7 +344,6 @@ end Compiled to cuda_tile.make_tensor_view. """ @noinline function make_tensor_view(arr::TileArray{T, N})::TensorView{T, N} where {T, N} - donotdelete(arr) TensorView{T, N}() end end @@ -359,11 +374,11 @@ end Compiled to cuda_tile.store_view_tko. """ @noinline function store_partition_view(pv::PartitionView{T, N, Shape}, - tile::Tile{T, Shape}, + tile::Tile{T}, latency::Union{Int, Nothing}, allow_tma::Bool, indices::NTuple{M, <:Integer}) where {T, N, Shape, M} - donotdelete(pv, tile, latency, allow_tma) + donotdelete() nothing end end diff --git a/src/language/operations.jl b/src/language/operations.jl index b368db4..0d49a7e 100644 --- a/src/language/operations.jl +++ b/src/language/operations.jl @@ -61,7 +61,7 @@ Axis is 1-indexed. Equivalent to cld(arr.sizes[axis], shape[axis]). """ @inline function num_tiles(arr::TileArray, axis::Integer, shape::NTuple{<:Any, Int}) tv = Intrinsics.make_tensor_view(arr) - pv = Intrinsics.make_partition_view(tv, Val(shape), PaddingMode.Undetermined, Val(nothing)) + pv = Intrinsics.make_partition_view(tv, shape, PaddingMode.Undetermined, nothing) Intrinsics.get_index_space_shape(pv, axis - One()) # convert to 0-indexed end @@ -103,7 +103,7 @@ tile = ct.load(arr, (bidx, bidy), (TM, TN); order=(2, 1)) latency::Union{Int, Nothing}=nothing, allow_tma::Bool=true) tv = Intrinsics.make_tensor_view(arr) - pv = Intrinsics.make_partition_view(tv, Val(shape), padding_mode, Val(order)) + pv = Intrinsics.make_partition_view(tv, shape, padding_mode, order) Intrinsics.load_partition_view(pv, latency, allow_tma, promote(index...) 
.- One()) end @@ -113,7 +113,7 @@ end latency::Union{Int, Nothing}=nothing, allow_tma::Bool=true) tv = Intrinsics.make_tensor_view(arr) - pv = Intrinsics.make_partition_view(tv, Val(shape), padding_mode, Val(order)) + pv = Intrinsics.make_partition_view(tv, shape, padding_mode, order) Intrinsics.load_partition_view(pv, latency, allow_tma, (index - One(),)) end @@ -125,7 +125,7 @@ end allow_tma::Bool=true) shape_val = _extract_shape(shape) tv = Intrinsics.make_tensor_view(arr) - pv = Intrinsics.make_partition_view(tv, Val(shape_val), padding_mode, Val(order)) + pv = Intrinsics.make_partition_view(tv, shape_val, padding_mode, order) Intrinsics.load_partition_view(pv, latency, allow_tma, promote(index...) .- One()) end @@ -137,7 +137,7 @@ end allow_tma::Bool=true) shape_val = _extract_shape(shape) tv = Intrinsics.make_tensor_view(arr) - pv = Intrinsics.make_partition_view(tv, Val(shape_val), padding_mode, Val(order)) + pv = Intrinsics.make_partition_view(tv, shape_val, padding_mode, order) Intrinsics.load_partition_view(pv, latency, allow_tma, promote(index...) .- One()) end @@ -184,7 +184,7 @@ Returns the stored tile (enables chaining and helps constant folding). latency::Union{Int, Nothing}=nothing, allow_tma::Bool=true) where {T} reshaped = _reshape_for_store(tile, Val(ndims(arr))) - _store_reshaped(arr, reshaped, Val(order), latency, allow_tma, promote(index...) .- One()) + _store_reshaped(arr, reshaped, order, latency, allow_tma, promote(index...) 
.- One()) return tile # XXX: enables constant folding; remove when possible (see "constant folding" test) end @@ -193,14 +193,14 @@ end latency::Union{Int, Nothing}=nothing, allow_tma::Bool=true) where {T} reshaped = _reshape_for_store(tile, Val(ndims(arr))) - _store_reshaped(arr, reshaped, Val(order), latency, allow_tma, (index - One(),)) + _store_reshaped(arr, reshaped, order, latency, allow_tma, (index - One(),)) return tile # XXX: enables constant folding; remove when possible (see "constant folding" test) end @inline function _store_reshaped(arr::TileArray{T}, tile::Tile{T}, - ::Val{Order}, latency, allow_tma, indices::NTuple{<:Any, <:Integer}) where {T, Order} + order, latency, allow_tma, indices::NTuple{<:Any, <:Integer}) where {T} tv = Intrinsics.make_tensor_view(arr) - pv = Intrinsics.make_partition_view(tv, Val(size(tile)), PaddingMode.Undetermined, Val(Order)) + pv = Intrinsics.make_partition_view(tv, size(tile), PaddingMode.Undetermined, order) Intrinsics.store_partition_view(pv, tile, latency, allow_tma, indices) end @@ -464,11 +464,11 @@ combined_last = ct.cat((tile_a, tile_b), -1) """ @inline function cat(tiles::Tuple{Tile{T, S1}, Tile{T, S2}}, axis::Int) where {T, S1, S2} axis0 = axis < 0 ? axis : axis - 1 - Intrinsics.cat(tiles, Val(axis0)) + Intrinsics.cat(tiles, axis0) end @inline function cat(tiles::Tuple{Tile{T, S1}, Tile{T, S2}}, ::Val{Axis}) where {T, S1, S2, Axis} axis0 = Axis < 0 ? 
Axis : Axis - 1 - Intrinsics.cat(tiles, Val(axis0)) + Intrinsics.cat(tiles, axis0) end """ @@ -483,7 +483,7 @@ expanded = ct.broadcast_to(row, (64, 128)) # Shape (64, 128) ``` """ @inline broadcast_to(tile::Tile{T}, shape::NTuple{<:Any, Int}) where {T} = - Intrinsics.broadcast(tile, Val(shape)) + Intrinsics.broadcast(tile, shape) """ reshape(tile::Tile{T, S}, shape::NTuple{N, Int}) -> Tile{T, shape} @@ -497,7 +497,7 @@ reshaped = reshape(tile, (2, 16)) # Shape (2, 16), still 32 elements ``` """ @inline Base.reshape(tile::Tile{T}, shape::NTuple{<:Any, Int}) where {T} = - Intrinsics.reshape(tile, Val(shape)) + Intrinsics.reshape(tile, shape) """ permutedims(tile::Tile{T, S}, perm) -> Tile{T, permuted_shape} @@ -512,9 +512,9 @@ permuted = permutedims(tile, (3, 1, 2)) # Shape (4, 2, 3) ``` """ @inline Base.permutedims(tile::Tile{T}, perm::NTuple{<:Any, Int}) where {T} = - Intrinsics.permute(tile, Val(map(p -> p - 1, perm))) + Intrinsics.permute(tile, map(p -> p - 1, perm)) @inline Base.permutedims(tile::Tile{T}, ::Val{Perm}) where {T, Perm} = - Intrinsics.permute(tile, Val(map(p -> p - 1, Perm))) + Intrinsics.permute(tile, map(p -> p - 1, Perm)) """ permutedims(tile::Tile{T, (M, N)}) -> Tile{T, (N, M)} @@ -537,9 +537,9 @@ Differs from `transpose` in that the operation is not recursive. first_dim = n >= 1 ? size(T, 1) : nothing if n == 2 - return :(Intrinsics.permute(tile, Val((1, 0)))) + return :(Intrinsics.permute(tile, (1, 0))) elseif n == 1 - return :(Intrinsics.reshape(tile, Val((1, $first_dim)))) + return :(Intrinsics.reshape(tile, (1, $first_dim))) else return :(throw(ArgumentError("permutedims(tile) only works for 1D or 2D tiles"))) end @@ -592,7 +592,7 @@ result = map(+, a, b) # Element-wise addition (same shape required) ``` """ @inline function Base.map(f, a::Tile{<:Any,S}, rest::Tile{<:Any,S}...) 
where {S} - Intrinsics.from_scalar(f(Intrinsics.to_scalar(a), map(Intrinsics.to_scalar, rest)...), Val(S)) + Intrinsics.from_scalar(f(Intrinsics.to_scalar(a), map(Intrinsics.to_scalar, rest)...), S) end """ @@ -624,7 +624,7 @@ vals, idxs = mapreduce(identity, reducer, vals_tile, idx_tile; """ @inline function Base.mapreduce(::typeof(identity), f, tile::Tile{T,S}; dims::Integer, init) where {T<:Number, S} - Intrinsics.reduce((tile,), Val(dims - 1), f, (T(init),))[1] + Intrinsics.reduce((tile,), dims - 1, f, (T(init),))[1] end @inline function Base.mapreduce(f, op, tile::Tile{T,S}; @@ -641,7 +641,7 @@ end function _combiner(args...) f(_deinterleave_accs(args...), _deinterleave_elems(args...)) end - Intrinsics.reduce(all_tiles, Val(dims - 1), _combiner, init) + Intrinsics.reduce(all_tiles, dims - 1, _combiner, init) end """ @@ -827,7 +827,7 @@ Supported functions: `+`, `*`, `max`, `min`. """ @inline function Base.accumulate(f, tile::Tile{T,S}; dims::Integer, init, rev::Bool=false) where {T<:Number, S} - Intrinsics.scan((tile,), Val(dims - 1), f, (T(init),), rev)[1] + Intrinsics.scan((tile,), dims - 1, f, (T(init),), rev)[1] end """ @@ -921,6 +921,6 @@ br = ct.extract(tile, (2, 2), (4, 4)) # Bottom-right (rows 5-8, cols 5-8) ``` """ @inline extract(tile::Tile{T}, index::NTuple{<:Any, Int}, shape::NTuple{<:Any, Int}) where {T} = - Intrinsics.extract(tile, Val(map(i -> i - 1, index)), Val(shape)) + Intrinsics.extract(tile, map(i -> i - 1, index), shape) @inline extract(tile::Tile{T}, ::Val{Index}, ::Val{Shape}) where {T, Index, Shape} = - Intrinsics.extract(tile, Val(map(i -> i - 1, Index)), Val(Shape)) + Intrinsics.extract(tile, map(i -> i - 1, Index), Shape) diff --git a/test/codegen.jl b/test/codegen.jl index ff984cf..a27bbf3 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -343,13 +343,18 @@ @testset "mixed-type integer comparison" begin @test @filecheck begin @check_label "entry" - code_tiled(Tuple{}) do + code_tiled(Tuple{ct.TileArray{Int64,1,spec1d}}) do out 
a = ct.arange((16,), Int64) b = ct.arange((16,), Int32) # Should promote Int32 to Int64 and compare @check "exti" @check "cmpi" + @check "select" result = a .< b + # Use same-typed operands for where to avoid Union type + b_promoted = ct.astype(b, Int64) + selected = ct.where(result, a, b_promoted) + ct.store(out, Int32(0), selected) return end end @@ -1070,6 +1075,32 @@ end end + @testset "power operations" begin + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}}) do a + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + @check "pow" + Base.donotdelete(tile .^ tile) + return + end + end + + # scalar exponent + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}}) do a + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + @check "broadcast" + @check "pow" + Base.donotdelete(tile .^ 2.0f0) + return + end + end + end + @testset "scalar math functions" begin # Test scalar math functions via overlays (sin, exp, sqrt, etc. 
on scalars) # Note: We pass scalar args to avoid constant folding at compile time @@ -1324,11 +1355,16 @@ spec = ct.ArraySpec{2}(16, true) @test @filecheck begin @check_label "entry" - code_tiled(Tuple{ct.TileArray{Float32,2,spec}}) do a + code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Float32,2,spec}}) do a, b @check "make_tensor_view" @check "make_partition_view" @check "get_index_space_shape" num = ct.num_tiles(a, 1, (32, 32)) + # Use num as a tile index to prevent DCE + @check "load_view_tko" + tile = ct.load(a, (num, Int32(0)), (32, 32)) + @check "store_view_tko" + ct.store(b, (Int32(0), Int32(0)), tile) return end end @@ -1894,6 +1930,20 @@ end @testset "method error detection" begin spec = ct.ArraySpec{1}(16, true) + isdefined(Core, :throw_methoderror) && + @testset "mismatched tile shapes with + produces MethodError" begin + spec2d = ct.ArraySpec{2}(16, true) + @test_throws "MethodError during Tile IR compilation" begin + code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a + pid = ct.bid(1) + tile_a = ct.load(a, pid, (4, 8)) + tile_b = ct.load(a, pid, (8, 4)) + Base.donotdelete(tile_a + tile_b) + return + end + end + end + isdefined(Core, :throw_methoderror) && @testset "no matching method produces MethodError" begin only_ints(x::Int) = x @@ -2020,6 +2070,69 @@ end end end +#============================================================================= + External Constants (GlobalRef handling) +=============================================================================# + +# Constants defined outside the kernel (module-level `const`) appear as GlobalRef +# nodes in Julia IR. These must emit proper ConstantOp for numeric types, +# not ghost values (which produce nothing in the bytecode). 
+ +const _CODEGEN_TEST_FLOAT32 = Float32(1 / log(2)) +const _CODEGEN_TEST_FLOAT64 = 3.14159 + +@testset "External Constants" begin + spec1d = ct.ArraySpec{1}(16, true) + + @testset "external Float32 constant in arithmetic" begin + # Bug 1: GlobalRef for Float32 must emit ConstantOp, not a ghost value. + # Previously, emit_value!(ctx, ::GlobalRef) wrapped all values as ghosts, + # causing MulFOp to receive `nothing` instead of a bytecode Value. + @test @filecheck begin + @check_label "entry" + code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}}) do a + pid = ct.bid(1) + tile = ct.load(a, pid, (16,)) + @check "constant tile) isa ct.Tile{Bool, Tuple{16}} - @test (tile .<= tile) isa ct.Tile{Bool, Tuple{16}} - @test (tile .>= tile) isa ct.Tile{Bool, Tuple{16}} - @test (tile .== tile) isa ct.Tile{Bool, Tuple{16}} - @test (tile .!= tile) isa ct.Tile{Bool, Tuple{16}} -end - -@testset "integer comparison operators" begin - int_tile = ct.arange((16,), Int) - - @test (int_tile .< int_tile) isa ct.Tile{Bool, Tuple{16}} - @test (int_tile .> int_tile) isa ct.Tile{Bool, Tuple{16}} - @test (int_tile .<= int_tile) isa ct.Tile{Bool, Tuple{16}} - @test (int_tile .>= int_tile) isa ct.Tile{Bool, Tuple{16}} - @test (int_tile .== int_tile) isa ct.Tile{Bool, Tuple{16}} - @test (int_tile .!= int_tile) isa ct.Tile{Bool, Tuple{16}} -end - -@testset "tile vs scalar comparison" begin - int_tile = ct.arange((16,), Int) - float_tile = ct.Tile{Float32, Tuple{16}}() - - @test (int_tile .< 10) isa ct.Tile{Bool, Tuple{16}} - @test (5 .< int_tile) isa ct.Tile{Bool, Tuple{16}} - - @test (float_tile .< 2.0f0) isa ct.Tile{Bool, Tuple{16}} - @test (1.0f0 .> float_tile) isa ct.Tile{Bool, Tuple{16}} -end - -@testset "broadcast comparison shapes" begin - tile_a = ct.Tile{Float32, Tuple{1, 16}}() - tile_b = ct.Tile{Float32, Tuple{8, 1}}() - - result = tile_a .< tile_b - @test result isa ct.Tile{Bool, Tuple{8, 16}} -end - -end - -@testset "power operations" begin - -@testset "float tile .^ float tile" begin - 
tile = ct.Tile{Float32, Tuple{16}}() - @test (tile .^ tile) isa ct.Tile{Float32, Tuple{16}} -end - -@testset "float tile .^ scalar" begin - tile = ct.Tile{Float32, Tuple{16}}() - @test (tile .^ 2.0f0) isa ct.Tile{Float32, Tuple{16}} - @test (2.0f0 .^ tile) isa ct.Tile{Float32, Tuple{16}} -end - -@testset "broadcast power shapes" begin - tile_a = ct.Tile{Float32, Tuple{1, 16}}() - tile_b = ct.Tile{Float32, Tuple{8, 1}}() - @test (tile_a .^ tile_b) isa ct.Tile{Float32, Tuple{8, 16}} -end - -@testset "integer power dispatches through generic broadcast" begin - int_tile = ct.arange((16,), Int) - # Generic copy→map accepts this (no MethodError), but it will fail - # at codegen time since there's no ^ overlay for integers. - @test (int_tile .^ int_tile) isa ct.Tile -end - -end - -@testset "multi-arg map" begin - a = ct.Tile{Float32, Tuple{16}}() - b = ct.Tile{Float32, Tuple{16}}() - c = ct.Tile{Float32, Tuple{16}}() - - # Binary map - @test map(+, a, b) isa ct.Tile{Float32, Tuple{16}} - - # Ternary map - @test map(fma, a, b, c) isa ct.Tile{Float32, Tuple{16}} - - # Broadcasting goes through the .op path, not map directly - @test (a .+ 1.0f0) isa ct.Tile{Float32, Tuple{16}} - @test (1.0f0 .+ a) isa ct.Tile{Float32, Tuple{16}} - - # Broadcasting with different shapes goes through .op path - row = ct.Tile{Float32, Tuple{4, 1}}() - col = ct.Tile{Float32, Tuple{1, 16}}() - @test (row .+ col) isa ct.Tile{Float32, Tuple{4, 16}} - - # Nested broadcast expression: a .+ b .* c - @test (a .+ b .* c) isa ct.Tile{Float32, Tuple{16}} -end