diff --git a/src/compiler/codegen/expressions.jl b/src/compiler/codegen/expressions.jl
index ee2ef26..02b7c38 100644
--- a/src/compiler/codegen/expressions.jl
+++ b/src/compiler/codegen/expressions.jl
@@ -34,17 +34,7 @@ In Tile IR codegen, only ghost types (zero-size immutables like `Val{V}`,
 """
 function emit_new!(ctx::CGCtx, expr::Expr, @nospecialize(result_type))
     T = CC.widenconst(result_type)
-
-    # Ghost types: no runtime representation
-    if is_ghost_type(T)
-        if T <: Val && length(T.parameters) == 1
-            return ghost_value(T, T.parameters[1])
-        elseif T <: Constant && length(T.parameters) >= 2
-            return ghost_value(T, T.parameters[2])
-        end
-        return ghost_value(T)
-    end
-
+    is_ghost_type(T) && return ghost_value(T)
     throw(IRError("Struct construction not supported in Tile IR: $T"))
 end
 
@@ -69,7 +59,7 @@ function emit_rhs!(ctx::CGCtx, @nospecialize(rhs), @nospecialize(result_type))
     elseif rhs isa QuoteNode
         return emit_constant!(ctx, rhs.value, result_type)
     elseif rhs isa GlobalRef
-        return nothing
+        return emit_value!(ctx, rhs)
     else
         return emit_constant!(ctx, rhs, result_type)
     end
diff --git a/src/compiler/codegen/utils.jl b/src/compiler/codegen/utils.jl
index 7503d1c..2676dd4 100644
--- a/src/compiler/codegen/utils.jl
+++ b/src/compiler/codegen/utils.jl
@@ -102,6 +102,16 @@ Optionally stores a compile-time constant value.
 ghost_value(@nospecialize(jltype)) = CGVal(nothing, TypeId(-1), jltype, Int[], nothing, nothing, nothing)
 ghost_value(@nospecialize(jltype), constant) = CGVal(nothing, TypeId(-1), jltype, Int[], nothing, Some(constant), nothing)
 
+"""
+    constant_value(jltype, type_id, constant) -> CGVal
+
+Deferred constant: has a Tile IR type but no bytecode value yet.
+Materialized (ConstantOp emitted) on demand at SSA lookup time.
+Parallel to Julia's non-ghost cgval from mark_julia_const.
+"""
+constant_value(@nospecialize(jltype), type_id::TypeId, constant) =
+    CGVal(nothing, type_id, jltype, Int[], nothing, Some(constant), nothing)
+
 """
     tuple_value(jltype, component_refs, component_constants) -> CGVal
 
@@ -278,15 +288,22 @@ function require_concrete_type(@nospecialize(T), context::String)
 end
 
 """
-    tile_type_for_julia!(ctx, T) -> TypeId
+    tile_type_for_julia!(ctx, T; throw_error=true) -> TypeId or nothing
 
-Get or create a Tile IR type for a Julia type.
+Get or create a Tile IR type for a Julia type. With `throw_error=false`, returns
+`nothing` instead of throwing if the type has no Tile IR representation.
 """
-function tile_type_for_julia!(ctx::CGCtx, @nospecialize(T))
+function tile_type_for_julia!(ctx::CGCtx, @nospecialize(T); throw_error::Bool=true)
     actual_type = CC.widenconst(T)
-    get!(ctx.type_cache, actual_type) do
-        _tile_type_for_julia!(ctx.tt, actual_type)
+    cached = get(ctx.type_cache, actual_type, nothing)  # fast path: type was translated before
+    cached !== nothing && return cached
+    type_id = _tile_type_for_julia!(ctx.tt, actual_type)  # `nothing` when the type is unsupported
+    if type_id !== nothing
+        ctx.type_cache[actual_type] = type_id  # only successful translations are cached
+        return type_id
     end
+    throw_error && throw(IRError("Unsupported Julia type for Tile IR: $actual_type"))
+    return nothing  # reached only with throw_error=false
 end
 
 function _tile_type_for_julia!(tt::TypeTable, @nospecialize(T::Type))
@@ -333,7 +350,7 @@ function _tile_type_for_julia!(tt::TypeTable, @nospecialize(T::Type))
         return tile_type!(tt, elem_dtype, shape)
     end
 
-    throw(IRError("Unsupported Julia type for Tile IR: $T"))
+    return nothing
 end
 
 """
@@ -414,17 +431,6 @@ function extract_argument_index(@nospecialize(arg))
     nothing
 end
 
-function resolve_or_constant(ctx::CGCtx, @nospecialize(arg), type_id::TypeId)
-    tv = emit_value!(ctx, arg)
-    # If we have a runtime value, use it
-    tv.v !== nothing && return tv.v
-    # Otherwise emit a constant from the compile-time value
-    tv.constant === nothing && throw(IRError("Cannot resolve argument"))
-    val = something(tv.constant)
-    bytes = reinterpret(UInt8, [Int32(val)])
-    encode_ConstantOp!(ctx.cb, type_id, collect(bytes))
-end
-
 #-----------------------------------------------------------------------------
 # Tile helpers
 #-----------------------------------------------------------------------------
diff --git a/src/compiler/codegen/values.jl b/src/compiler/codegen/values.jl
index 7286ae0..64c3c27 100644
--- a/src/compiler/codegen/values.jl
+++ b/src/compiler/codegen/values.jl
@@ -7,8 +7,27 @@ Emit/resolve a value reference to a CGVal using multiple dispatch.
 """
 function emit_value!(ctx::CGCtx, ssa::SSAValue)
     tv = ctx[ssa]
-    tv !== nothing && return tv
-    throw(IRError("SSAValue %$(ssa.id) not found in context"))
+    tv !== nothing || throw(IRError("SSAValue %$(ssa.id) not found in context"))
+    return maybe_materialize!(ctx, ssa, tv)
+end
+
+"""
+    maybe_materialize!(ctx, ssa, tv) -> CGVal
+
+Materialize a deferred constant into bytecode on demand.
+Only acts on CGVals with `type_id != TypeId(-1)` and no value yet (deferred constants).
+"""
+function maybe_materialize!(ctx::CGCtx, ssa::SSAValue, tv::CGVal)
+    tv.v !== nothing && return tv               # already materialized
+    tv.type_id == TypeId(-1) && return tv       # ghost — nothing to materialize
+    tv.constant === nothing && return tv         # no constant to materialize
+
+    val = something(tv.constant)  # unwrap the Some(...) that holds the compile-time value
+    bytes = constant_to_bytes(val, CC.widenconst(tv.jltype))
+    v = encode_ConstantOp!(ctx.cb, tv.type_id, bytes)  # emitted into ctx.cb at the time of this lookup
+    materialized = CGVal(v, tv.type_id, tv.jltype, Int[], nothing, tv.constant, nothing)  # same fields as tv, now carrying v
+    ctx[ssa] = materialized  # cached for later lookups; NOTE(review): confirm the first lookup cannot sit in a nested region that later uses are outside of
+    return materialized
 end
 emit_value!(ctx::CGCtx, arg::Argument) = ctx[arg]
 emit_value!(ctx::CGCtx, slot::SlotNumber) = ctx[slot]
@@ -61,7 +80,17 @@ end
 
 function emit_value!(ctx::CGCtx, ref::GlobalRef)
     val = getfield(ref.mod, ref.name)
-    ghost_value(typeof(val), val)
+    T = typeof(val)
+    # Ghost types have no materializable representation.
+    # Non-ghost types with a Tile IR type become deferred constants (materialized on demand).
+    # Everything else (functions, enums, etc.) is compile-time only → ghost.
+    if !is_ghost_type(T)
+        type_id = tile_type_for_julia!(ctx, T; throw_error=false)  # `nothing` when T has no Tile IR type
+        if type_id !== nothing
+            return constant_value(T, type_id, val)  # NOTE(review): only the SSAValue lookup visibly materializes this; confirm direct GlobalRef operands are handled
+        end
+    end
+    ghost_value(T, val)  # compile-time only; val is kept as the constant
 end
 
 function emit_value!(ctx::CGCtx, node::PiNode)
@@ -73,7 +102,7 @@ function emit_value!(ctx::CGCtx, node::PiNode)
     emit_value!(ctx, node.val)
 end
 
-emit_value!(ctx::CGCtx, ::Nothing) = nothing
+emit_value!(ctx::CGCtx, ::Nothing) = ghost_value(Nothing, nothing)
 
 """
     get_constant(ctx, ref) -> Union{Any, Nothing}
@@ -89,7 +118,14 @@ function get_constant(ctx::CGCtx, @nospecialize(ref))
     end
     # IR references - extract constant through emit_value!
     tv = emit_value!(ctx, ref)
-    tv === nothing ? nothing : (tv.constant === nothing ? nothing : something(tv.constant))
+    tv === nothing && return nothing
+    if tv.constant !== nothing
+        return something(tv.constant)
+    end
+    # Any ghost singleton can be reconstructed from its type
+    T = CC.widenconst(tv.jltype)
+    is_ghost_type(T) && isdefined(T, :instance) && return T.instance
+    return nothing
 end
 
 # Symbols are compile-time only values
@@ -107,15 +143,8 @@ emit_value!(ctx::CGCtx, @nospecialize(val::Type)) = ghost_value(Type{val}, val)
 # Fallback for other types (constants embedded in IR)
 function emit_value!(ctx::CGCtx, @nospecialize(val))
     T = typeof(val)
-    # Handle Val{V} instances
-    if T <: Val && length(T.parameters) == 1
-        return ghost_value(T, T.parameters[1])
-    end
-    # Handle Constant{T, V} instances
-    if T <: Constant && length(T.parameters) >= 2
-        return ghost_value(T, T.parameters[2])
-    end
-    throw(IRError("Unhandled value type in emit_value!: $(typeof(val))"))
+    is_ghost_type(T) && return ghost_value(T, val)
+    throw(IRError("Unsupported value type in Tile IR codegen: $T"))
 end
 
 
diff --git a/src/compiler/interface.jl b/src/compiler/interface.jl
index 59628d0..bdc6e8a 100644
--- a/src/compiler/interface.jl
+++ b/src/compiler/interface.jl
@@ -73,6 +73,16 @@ CC.may_optimize(::cuTileInterpreter) = true
 CC.may_compress(::cuTileInterpreter) = true
 CC.may_discard_trees(::cuTileInterpreter) = false
 
+#=============================================================================
+ Custom return-type inference (tfuncs) for intrinsics
+=============================================================================#
+
+# Per-intrinsic return type overrides using multiple dispatch.
+# Returns nothing when no override applies (fallback).
+# Concrete per-intrinsic methods are defined in intrinsics/ (after the
+# Intrinsics module exists).
+tfunc(@nospecialize(f), argtypes::Vector{Any}) = nothing
+
 #=============================================================================
  Subprogram inference for reduce/scan
 =============================================================================#
@@ -147,7 +157,8 @@ function _infer_subprogram(interp::cuTileInterpreter, @nospecialize(f),
     end
 end
 
-# Override abstract_call_known to trigger subprogram inference for reduce/scan.
+# Override abstract_call_known for custom return-type inference (tfuncs) and
+# subprogram inference for reduce/scan.
 #
 # On 1.12+, abstract_call_known returns Future{CallMeta}. The caller uses the
 # CallMeta.info to populate stmt_info[pc], which compute_edges! later walks.
@@ -161,16 +172,18 @@ end
         result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any,
             arginfo::CC.ArgInfo, si::CC.StmtInfo, vtypes::Union{CC.VarTable,Nothing},
             sv::CC.InferenceState, max_methods::Int)
+        rt_override = tfunc(f, arginfo.argtypes)
         subprog = _infer_subprogram(interp, f, arginfo, si, vtypes, sv)
-        subprog === nothing && return result
+        rt_override === nothing && subprog === nothing && return result
         wrapped = CC.Future{CC.CallMeta}()
         push!(sv.tasks, function (interp′, sv′)
             isready(result) || return false
-            isready(subprog) || return false
+            subprog !== nothing && !isready(subprog) && return false
             cm = result[]
-            sp = subprog[]
-            wrapped[] = CC.CallMeta(cm.rt, cm.exct, cm.effects,
-                                    SubprogramCallInfo(cm.info, sp.info), cm.refinements)
+            sp = subprog !== nothing ? subprog[] : nothing
+            rt = rt_override !== nothing ? rt_override : cm.rt
+            info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info
+            wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements)
             return true
         end)
         return wrapped
@@ -182,16 +195,18 @@ elseif isdefined(CC, :Future)   # 1.12–1.13
         result = @invoke CC.abstract_call_known(interp::CC.AbstractInterpreter, f::Any,
             arginfo::CC.ArgInfo, si::CC.StmtInfo,
             sv::CC.InferenceState, max_methods::Int)
+        rt_override = tfunc(f, arginfo.argtypes)
         subprog = _infer_subprogram(interp, f, arginfo, si, nothing, sv)
-        subprog === nothing && return result
+        rt_override === nothing && subprog === nothing && return result
         wrapped = CC.Future{CC.CallMeta}()
         push!(sv.tasks, function (interp′, sv′)
             isready(result) || return false
-            isready(subprog) || return false
+            subprog !== nothing && !isready(subprog) && return false
             cm = result[]
-            sp = subprog[]
-            wrapped[] = CC.CallMeta(cm.rt, cm.exct, cm.effects,
-                                    SubprogramCallInfo(cm.info, sp.info), cm.refinements)
+            sp = subprog !== nothing ? subprog[] : nothing
+            rt = rt_override !== nothing ? rt_override : cm.rt
+            info = sp !== nothing ? SubprogramCallInfo(cm.info, sp.info) : cm.info
+            wrapped[] = CC.CallMeta(rt, cm.exct, cm.effects, info, cm.refinements)
             return true
         end)
         return wrapped
@@ -204,6 +219,11 @@ else   # 1.11: synchronous, edges auto-tracked via stmt_edges
             arginfo::CC.ArgInfo, si::CC.StmtInfo,
             sv::CC.AbsIntState, max_methods::Int)
         _infer_subprogram(interp, f, arginfo, si, nothing, sv)  # side-effect only
+        rt_override = tfunc(f, arginfo.argtypes)
+        if rt_override !== nothing
+            return CC.CallMeta(rt_override, result.exct, result.effects,
+                               result.info)
+        end
         return result
     end
 end
diff --git a/src/compiler/intrinsics.jl b/src/compiler/intrinsics.jl
index 9beed19..aa0d425 100644
--- a/src/compiler/intrinsics.jl
+++ b/src/compiler/intrinsics.jl
@@ -19,6 +19,15 @@ end
 #       Sometimes that's not possible, e.g., because the functionality required for that is
 #       overlayed by methods calling back into the intrinsic (e.g. `sin`), so for those
 #       intrinsics we disable constant folding using a `compilerbarrier(:const)`
+#
+# NOTE: Side-effectful intrinsics (stores, atomics) use `donotdelete(args...)` in their
+#       bodies to prevent the optimizer from DCE'ing calls. `donotdelete` is a Julia builtin
+#       with `effect_free=ALWAYS_FALSE`, which inference propagates through the function body.
+#       `@assume_effects !:effect_free` does NOT work — `override_effects` can only strengthen
+#       effects (set ALWAYS_TRUE), not weaken them. Spoofing `ipo_effects` via a custom
+#       `CC.finish!` override is possible but fragile (must race against `finishinfer!` setting
+#       `use_const_api` based on pre-override effects). `donotdelete` is the simplest correct
+#       approach.
 
 emit_intrinsic!(ctx::CGCtx, @nospecialize(func), args) = missing
 
diff --git a/src/compiler/intrinsics/arithmetic.jl b/src/compiler/intrinsics/arithmetic.jl
index 6366910..6272251 100644
--- a/src/compiler/intrinsics/arithmetic.jl
+++ b/src/compiler/intrinsics/arithmetic.jl
@@ -97,7 +97,7 @@ end
 # cuda_tile.addi
 @eval Intrinsics begin
     @noinline addi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.add_int(x, y)
-    @noinline addi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline addi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addi), args)
     emit_binop!(ctx, args, encode_AddIOp!)
@@ -105,7 +105,7 @@ end
 
 # cuda_tile.cldi (ceiling division, toward positive infinity)
 @eval Intrinsics begin
-    @noinline cldi(x::T, y::T, s::Signedness) where {T<:Integer} = (donotdelete(x, y, s); compilerbarrier(:const, zero(T)))
+    @noinline cldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T))
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cldi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("cldi requires compile-time signedness"))
@@ -130,7 +130,7 @@ end
         end
     end
     @noinline cmpi(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate, ::Signedness) where {T<:Integer, S} =
-        (donotdelete(a, b); Tile{Bool, S}())
+        Tile{Bool, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpi), args)
     cb = ctx.cb
@@ -168,7 +168,7 @@ end
 
 # cuda_tile.fldi (floor division, toward negative infinity)
 @eval Intrinsics begin
-    @noinline fldi(x::T, y::T, s::Signedness) where {T<:Integer} = (donotdelete(x, y, s); compilerbarrier(:const, zero(T)))
+    @noinline fldi(x::T, y::T, s::Signedness) where {T<:Integer} = compilerbarrier(:const, zero(T))
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fldi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("fldi requires compile-time signedness"))
@@ -182,7 +182,7 @@ end
         ifelse(lt, y, x)
     end
     @noinline maxi(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} =
-        (donotdelete(a, b); Tile{T, S}())
+        Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxi), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("maxi requires compile-time signedness"))
@@ -196,7 +196,7 @@ end
         ifelse(lt, x, y)
     end
     @noinline mini(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} =
-        (donotdelete(a, b); Tile{T, S}())
+        Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mini), args)
     signedness = @something get_constant(ctx, args[3]) throw(IRError("mini requires compile-time signedness"))
@@ -206,7 +206,7 @@ end
 # cuda_tile.muli
 @eval Intrinsics begin
     @noinline muli(x::T, y::T) where {T<:Integer} = Core.Intrinsics.mul_int(x, y)
-    @noinline muli(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline muli(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.muli), args)
     emit_binop!(ctx, args, encode_MulIOp!)
@@ -218,7 +218,7 @@ end
     @noinline function mulhii(x::T, y::T, s::Signedness) where {T<:Integer}
         ((widen(x) * widen(y)) >>> (8 * sizeof(T))) % T
     end
-    @noinline mulhii(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline mulhii(a::Tile{T, S}, b::Tile{T, S}, s::Signedness) where {T<:Integer, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulhii), args)
     emit_binop!(ctx, args, encode_MulhiIOp!)
@@ -266,7 +266,7 @@ end
 # cuda_tile.subi
 @eval Intrinsics begin
     @noinline subi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.sub_int(x, y)
-    @noinline subi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline subi(a::Tile{T, S}, b::Tile{T, S}) where {T<:Integer, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subi), args)
     emit_binop!(ctx, args, encode_SubIOp!)
@@ -287,7 +287,7 @@ end
 # cuda_tile.addf
 @eval Intrinsics begin
     @noinline addf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.add_float(x, y)
-    @noinline addf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline addf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.addf), args)
     emit_binop!(ctx, args, encode_AddFOp!)
@@ -311,7 +311,7 @@ end
         end
     end
     @noinline cmpf(a::Tile{T, S}, b::Tile{T, S}, ::ComparisonPredicate) where {T<:AbstractFloat, S} =
-        (donotdelete(a, b); Tile{Bool, S}())
+        Tile{Bool, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cmpf), args)
     cb = ctx.cb
@@ -338,7 +338,7 @@ end
 # cuda_tile.divf
 @eval Intrinsics begin
     @noinline divf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.div_float(x, y)
-    @noinline divf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline divf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.divf), args)
     emit_binop!(ctx, args, encode_DivFOp!)
@@ -347,7 +347,7 @@ end
 # cuda_tile.mulf
 @eval Intrinsics begin
     @noinline mulf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.mul_float(x, y)
-    @noinline mulf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline mulf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.mulf), args)
     emit_binop!(ctx, args, encode_MulFOp!)
@@ -365,7 +365,7 @@ end
 # cuda_tile.subf
 @eval Intrinsics begin
     @noinline subf(x::T, y::T) where {T<:AbstractFloat} = Core.Intrinsics.sub_float(x, y)
-    @noinline subf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline subf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.subf), args)
     emit_binop!(ctx, args, encode_SubFOp!)
@@ -378,7 +378,7 @@ end
 @eval Intrinsics begin
     @noinline andi(x::T, y::T) where {T<:Integer} = Core.Intrinsics.and_int(x, y)
     """Element-wise logical AND for boolean tiles."""
-    @noinline andi(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = (donotdelete(a, b); Tile{Bool, S}())
+    @noinline andi(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.andi), args)
     cb = ctx.cb
@@ -399,7 +399,7 @@ end
 @eval Intrinsics begin
     @noinline ori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.or_int(x, y)
     """Element-wise logical OR for boolean tiles."""
-    @noinline ori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = (donotdelete(a, b); Tile{Bool, S}())
+    @noinline ori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.ori), args)
     cb = ctx.cb
@@ -420,7 +420,7 @@ end
 @eval Intrinsics begin
     @noinline xori(x::T, y::T) where {T<:Integer} = Core.Intrinsics.xor_int(x, y)
     """Element-wise logical XOR for boolean tiles."""
-    @noinline xori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = (donotdelete(a, b); Tile{Bool, S}())
+    @noinline xori(a::Tile{Bool, S}, b::Tile{Bool, S}) where {S} = Tile{Bool, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.xori), args)
     cb = ctx.cb
diff --git a/src/compiler/intrinsics/atomics.jl b/src/compiler/intrinsics/atomics.jl
index e31d966..3c89bd4 100644
--- a/src/compiler/intrinsics/atomics.jl
+++ b/src/compiler/intrinsics/atomics.jl
@@ -41,7 +41,7 @@ end
     """
     @noinline function atomic_cas(array::TileArray{T, N}, index, expected, desired,
                                    memory_order::Int, memory_scope::Int) where {T, N}
-        donotdelete(array, index, expected, desired)
+        donotdelete()
         compilerbarrier(:const, zero(T))::T
     end
 end
@@ -179,7 +179,7 @@ end
     """
     @noinline function atomic_xchg(array::TileArray{T, N}, index, val,
                                     memory_order::Int, memory_scope::Int) where {T, N}
-        donotdelete(array, index, val)
+        donotdelete()
         compilerbarrier(:const, zero(T))
     end
 end
@@ -198,7 +198,7 @@ end
     """
     @noinline function atomic_add(array::TileArray{T, N}, index, val,
                                    memory_order::Int, memory_scope::Int) where {T, N}
-        donotdelete(array, index, val)
+        donotdelete()
         compilerbarrier(:const, zero(T))
     end
 end
diff --git a/src/compiler/intrinsics/conversions.jl b/src/compiler/intrinsics/conversions.jl
index 197e13b..5b522c4 100644
--- a/src/compiler/intrinsics/conversions.jl
+++ b/src/compiler/intrinsics/conversions.jl
@@ -10,7 +10,6 @@
     cuda_tile.exti, or cuda_tile.trunci based on source/target types.
     """
     @noinline function astype(tile::Tile{T1, Shape}, ::Type{T2}) where {T1, Shape, T2}
-        donotdelete(tile)
         Tile{T2, Shape}()
     end
 end
diff --git a/src/compiler/intrinsics/core.jl b/src/compiler/intrinsics/core.jl
index 25b6c34..bcb3eca 100644
--- a/src/compiler/intrinsics/core.jl
+++ b/src/compiler/intrinsics/core.jl
@@ -26,10 +26,20 @@ end
     Explicitly broadcast a tile to a target shape.
     Compiled to cuda_tile.broadcast.
     """
-    @noinline function broadcast(tile::Tile{T}, ::Val{Shape}) where {T, Shape}
-        Tile{T, Tuple{Shape...}}()
+    @noinline function broadcast(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N}
+        compilerbarrier(:type, nothing)
     end
 end
+function tfunc(::typeof(Intrinsics.broadcast), argtypes::Vector{Any})  # -> Tile type, or nothing for "no override"
+    length(argtypes) >= 3 || return nothing  # argtypes[2] is the tile, argtypes[3] the target shape
+    tile_type = CC.widenconst(argtypes[2])
+    tile_type <: Tile || return nothing
+    shape_arg = argtypes[3]
+    (shape_arg isa CC.Const && shape_arg.val isa Tuple{Vararg{Int}}) || return nothing  # payload need not satisfy the method signature; reject instead of throwing in inference
+    shape = shape_arg.val
+    T = eltype(tile_type)
+    return Tile{T, Tuple{shape...}}  # lift the constant shape into the type domain
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.broadcast), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -42,7 +52,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.broadcast), args)
     source_type = CC.widenconst(source.jltype)
     source_elem = eltype(source_type)
 
-    # Extract target shape from the constant tuple argument
+    # Extract target shape
     target_shape_tuple = get_constant(ctx, args[2])
     target_shape_tuple isa Tuple || throw(IRError("broadcast() shape must be a compile-time constant tuple"))
     target_shape = collect(Int, target_shape_tuple)
@@ -106,19 +116,28 @@ end
     Concatenate two tiles along 0-indexed axis.
     Compiled to cuda_tile.cat.
     """
-    @noinline function cat(tiles::Tuple{Tile{T}, Tile{T}}, ::Val{Axis}) where {T, Axis}
-        t1, t2 = tiles
-        n = ndims(t1)
-        axis = Axis < 0 ? n + Axis : Axis
-        result_shape = ntuple(n) do i
-            if i == axis + 1  # 0-indexed axis, 1-indexed tuple access
-                size(t1, i) + size(t2, i)
-            else
-                size(t1, i)
-            end
-        end
-        Tile{T, Tuple{result_shape...}}()
-    end
+    @noinline function cat(tiles::Tuple{Tile{T, S1}, Tile{T, S2}}, axis::Integer) where {T, S1, S2}
+        compilerbarrier(:type, nothing)
+    end
+end
+function tfunc(::typeof(Intrinsics.cat), argtypes::Vector{Any})  # -> Tile type, or nothing for "no override"
+    length(argtypes) >= 3 || return nothing  # argtypes[2] is the tile pair, argtypes[3] the axis
+    tuple_type = CC.widenconst(argtypes[2])
+    (tuple_type isa DataType && tuple_type <: Tuple{Tile, Tile}) || return nothing  # Union/UnionAll/Union{} have no `.parameters`
+    axis_arg = argtypes[3]
+    (axis_arg isa CC.Const && axis_arg.val isa Integer) || return nothing  # payload need not satisfy the method signature
+    axis = axis_arg.val
+    t1_type, t2_type = tuple_type.parameters  # exactly two parameters, guaranteed by the check above
+    (t1_type <: Tile && t2_type <: Tile) || return nothing
+    T = eltype(t1_type)
+    s1 = size(t1_type)
+    s2 = size(t2_type)
+    n = length(s1)
+    (n > 0 && length(s2) == n) || return nothing  # unknown shape or rank mismatch: no override
+    a = axis < 0 ? n + axis : axis  # negative axes count from the end (0-indexed)
+    0 <= a < n || return nothing  # out-of-range axis: leave the error to later stages
+    result_shape = ntuple(i -> i == a + 1 ? s1[i] + s2[i] : s1[i], n)  # only the concatenation axis grows
+    return Tile{T, Tuple{result_shape...}}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cat), args)
     cb = ctx.cb
@@ -137,7 +156,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.cat), args)
     rhs = emit_value!(ctx, tuple_tv.tuple[2])
     (lhs === nothing || rhs === nothing) && throw(IRError("Cannot resolve tile operands for cat()"))
 
-    # Get axis from Val{Axis}
+    # Get axis
     axis_val = get_constant(ctx, args[2])
     axis_val isa Integer || throw(IRError("cat() axis must be a compile-time constant integer"))
 
@@ -175,9 +194,19 @@ end
     Compiled to cuda_tile.constant.
     """
     @noinline function constant(shape::NTuple{N, Int}, value, ::Type{T}) where {N, T}
-        Tile{T, Tuple{shape...}}()
+        compilerbarrier(:type, nothing)
     end
 end
+function tfunc(::typeof(Intrinsics.constant), argtypes::Vector{Any})  # -> Tile type, or nothing for "no override"
+    length(argtypes) >= 4 || return nothing  # argtypes[2] is the shape, argtypes[4] the element type
+    shape_arg = argtypes[2]
+    (shape_arg isa CC.Const && shape_arg.val isa Tuple{Vararg{Int}}) || return nothing  # payload need not satisfy the method signature
+    shape = shape_arg.val
+    type_arg = CC.widenconst(argtypes[4])
+    (type_arg isa DataType && type_arg <: Type && !isempty(type_arg.parameters)) || return nothing  # rejects `DataType` and UnionAll like Type{<:Real}
+    T = type_arg.parameters[1]  # the T of Type{T}
+    return Tile{T, Tuple{shape...}}
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.constant), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -214,10 +243,20 @@ end
     Extract a sub-tile from tile at 0-indexed slice indices.
     Compiled to cuda_tile.extract.
     """
-    @noinline function extract(tile::Tile{T}, ::Val{Index}, ::Val{Shape}) where {T, Index, Shape}
-        Tile{T, Tuple{Shape...}}()
+    @noinline function extract(tile::Tile{T}, index::NTuple{N, Int}, shape::NTuple{N, Int}) where {T, N}
+        compilerbarrier(:type, nothing)
     end
 end
+function tfunc(::typeof(Intrinsics.extract), argtypes::Vector{Any})  # -> Tile type, or nothing for "no override"
+    length(argtypes) >= 4 || return nothing  # argtypes[2] is the tile, argtypes[4] the sub-tile shape
+    tile_type = CC.widenconst(argtypes[2])
+    tile_type <: Tile || return nothing
+    shape_arg = argtypes[4]
+    (shape_arg isa CC.Const && shape_arg.val isa Tuple{Vararg{Int}}) || return nothing  # payload need not satisfy the method signature
+    shape = shape_arg.val
+    T = eltype(tile_type)
+    return Tile{T, Tuple{shape...}}  # result keeps the source element type
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.extract), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -226,11 +265,11 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.extract), args)
     source = emit_value!(ctx, args[1])
     source === nothing && throw(IRError("Cannot resolve source operand for extract()"))
 
-    # Extract index from Val{Index} argument
+    # Extract index
     index_tuple = get_constant(ctx, args[2])
     index_tuple isa Tuple || throw(IRError("extract() index must be a compile-time constant tuple"))
 
-    # Extract shape from Val{Shape} argument
+    # Extract shape
     shape_tuple = get_constant(ctx, args[3])
     shape_tuple isa Tuple || throw(IRError("extract() shape must be a compile-time constant tuple"))
     output_shape = collect(Int, shape_tuple)
@@ -312,9 +351,19 @@ end
     Compiled to cuda_tile.iota.
     """
     @noinline function iota(shape::NTuple{1, Int}, ::Type{T}) where {T}
-        Tile{T, Tuple{shape...}}()
+        compilerbarrier(:type, nothing)
     end
 end
+function tfunc(::typeof(Intrinsics.iota), argtypes::Vector{Any})  # -> Tile type, or nothing for "no override"
+    length(argtypes) >= 3 || return nothing  # argtypes[2] is the shape, argtypes[3] the element type
+    shape_arg = argtypes[2]
+    (shape_arg isa CC.Const && shape_arg.val isa Tuple{Vararg{Int}}) || return nothing  # payload need not satisfy the method signature
+    shape = shape_arg.val
+    type_arg = CC.widenconst(argtypes[3])
+    (type_arg isa DataType && type_arg <: Type && !isempty(type_arg.parameters)) || return nothing  # rejects `DataType` and UnionAll like Type{<:Real}
+    T = type_arg.parameters[1]  # the T of Type{T}
+    return Tile{T, Tuple{shape...}}
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.iota), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -427,11 +476,22 @@ end
     Permute tile dimensions according to 0-indexed permutation.
     Compiled to cuda_tile.permute.
     """
-    @noinline function permute(tile::Tile{T}, ::Val{Perm}) where {T, Perm}
-        # Compute permuted shape: for each position i in output, take size(tile, Perm[i]+1)
-        permuted_shape = ntuple(i -> size(tile, Perm[i] + 1), ndims(tile))
-        Tile{T, Tuple{permuted_shape...}}()
-    end
+    @noinline function permute(tile::Tile{T, S}, perm::NTuple{N, Int}) where {T, S, N}
+        compilerbarrier(:type, nothing)
+    end
+end
+function tfunc(::typeof(Intrinsics.permute), argtypes::Vector{Any})  # -> Tile type, or nothing for "no override"
+    length(argtypes) >= 3 || return nothing  # argtypes[2] is the tile, argtypes[3] the permutation
+    tile_type = CC.widenconst(argtypes[2])
+    tile_type <: Tile || return nothing
+    perm_arg = argtypes[3]
+    (perm_arg isa CC.Const && perm_arg.val isa Tuple{Vararg{Int}}) || return nothing  # payload need not satisfy the method signature
+    perm = perm_arg.val
+    s = size(tile_type)
+    isempty(s) && return nothing
+    all(p -> 0 <= p < length(s), perm) || return nothing  # 0-indexed entries must address an existing dimension
+    permuted_shape = ntuple(i -> s[perm[i] + 1], length(perm))  # output dim i takes the size of input dim perm[i]
+    return Tile{eltype(tile_type), Tuple{permuted_shape...}}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.permute), args)
     cb = ctx.cb
@@ -444,7 +504,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.permute), args)
     input_shape = source.shape
     isempty(input_shape) && throw(IRError("Cannot determine tile shape for permute()"))
 
-    # Extract permutation from Val{Perm} argument
+    # Extract permutation
     perm_tuple = get_constant(ctx, args[2])
     perm_tuple isa Tuple || throw(IRError("permute() permutation must be a compile-time constant tuple"))
 
@@ -477,9 +537,18 @@ end
     Compiled to cuda_tile.permute with perm=(1, 0).
     """
     @noinline function transpose(tile::Tile{T}) where {T}
-        Tile{T, Tuple{reverse(size(tile))...}}()
+        compilerbarrier(:type, nothing)
     end
 end
+function tfunc(::typeof(Intrinsics.transpose), argtypes::Vector{Any})
+    length(argtypes) < 2 && return nothing
+    operand = CC.widenconst(argtypes[2])
+    operand <: Tile || return nothing
+    dims = size(operand)
+    # An empty size tuple gives nothing to reverse, so no result type is produced.
+    isempty(dims) && return nothing
+    return Tile{eltype(operand), Tuple{reverse(dims)...}}
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.transpose), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -516,18 +585,32 @@ end
     callers wrap in 1-tuples and unwrap with `[1]`.
     Compiled to cuda_tile.reduce.
     """
-    @noinline function reduce(tiles::Tuple{Tile{T}}, ::Val{axis}, f,
-                              identities::Tuple{Any}) where {T, axis}
-        tile = tiles[1]
-        reduced_shape = ntuple(i -> i == axis + 1 ? 1 : size(tile, i), ndims(tile))
-        (Tile{T, Tuple{reduced_shape...}}(),)
-    end
-    @noinline function reduce(tiles::Tuple{Tile{T}, Tile, Vararg{Tile}}, ::Val{axis}, f,
-                              identities::Tuple{Any, Any, Vararg{Any}}) where {T, axis}
-        tile = tiles[1]
-        reduced_shape = ntuple(i -> i == axis + 1 ? 1 : size(tile, i), ndims(tile))
-        (Tile{T, Tuple{reduced_shape...}}(), reduce(Base.tail(tiles), Val(axis), f, Base.tail(identities))...)
-    end
+    @noinline function reduce(tiles::Tuple{Tile{T, S}}, axis::Integer, f,
+                              identities::Tuple{Any}) where {T, S}
+        compilerbarrier(:type, nothing)
+    end
+    @noinline function reduce(tiles::Tuple{Tile{T1, S}, Tile{T2, S}}, axis::Integer, f,
+                              identities::Tuple{Any, Any}) where {T1, T2, S}
+        compilerbarrier(:type, nothing)
+    end
+end
+function tfunc(::typeof(Intrinsics.reduce), argtypes::Vector{Any})
+    length(argtypes) < 3 && return nothing
+    operands = CC.widenconst(argtypes[2])
+    (operands isa DataType && operands <: Tuple) || return nothing
+    axis_arg = argtypes[3]
+    axis_arg isa CC.Const || return nothing
+    axis = axis_arg.val
+    reduced = Any[]
+    for tile_type in operands.parameters
+        (tile_type isa DataType && tile_type <: Tile) || return nothing
+        elt, dims = eltype(tile_type), size(tile_type)
+        isempty(dims) && return nothing
+        # Collapse the 0-indexed reduction axis to extent 1; other extents are kept.
+        shape = ntuple(i -> i == axis + 1 ? 1 : dims[i], length(dims))
+        push!(reduced, Tile{elt, Tuple{shape...}})
+    end
+    return Tuple{reduced...}
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reduce), args)
     emit_reduce!(ctx, args)
@@ -547,7 +630,7 @@ function emit_reduce!(ctx::CGCtx, args)
     N = length(tile_tvs)
 
     # Get reduction axis
-    axis = @something get_constant(ctx, args[2]) throw(IRError("Reduction axis must be a compile-time constant"))
+    axis = get_constant(ctx, args[2])
 
     # Resolve combiner function
     func = get_constant(ctx, args[3])
@@ -648,10 +731,20 @@ make_identity_val(val, dtype, ::Type{T}) where T <: Integer =
     Reshape a tile to a new shape (same total elements).
     Compiled to cuda_tile.reshape.
     """
-    @noinline function reshape(tile::Tile{T}, ::Val{Shape}) where {T, Shape}
-        Tile{T, Tuple{Shape...}}()
+    @noinline function reshape(tile::Tile{T}, shape::NTuple{N, Int}) where {T, N}
+        compilerbarrier(:type, nothing)
     end
 end
+function tfunc(::typeof(Intrinsics.reshape), argtypes::Vector{Any})
+    length(argtypes) < 3 && return nothing
+    source = CC.widenconst(argtypes[2])
+    source <: Tile || return nothing
+    # Only a constant target shape can be lifted into the result type.
+    target = argtypes[3]
+    target isa CC.Const || return nothing
+    dims = target.val
+    return Tile{eltype(source), Tuple{dims...}}
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reshape), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -660,7 +753,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.reshape), args)
     source = emit_value!(ctx, args[1])
     source === nothing && throw(IRError("Cannot resolve source operand for reshape()"))
 
-    # Extract target shape from Val{Shape} argument
+    # Extract target shape
     target_shape_tuple = get_constant(ctx, args[2])
     target_shape_tuple isa Tuple || throw(IRError("reshape() shape must be a compile-time constant tuple"))
     target_shape = collect(Int, target_shape_tuple)
@@ -720,11 +813,22 @@ end
     `reverse=true` for a reverse (suffix) scan.
     Compiled to cuda_tile.scan.
     """
-    @noinline function scan(tiles::Tuple{Tile{T, S}}, ::Val{axis}, f,
-                            identities::Tuple{Any}, reverse::Bool=false) where {T, S, axis}
-        (Tile{T, S}(),)
+    @noinline function scan(tiles::Tuple{Tile{T, S}}, axis::Integer, f,
+                            identities::Tuple{Any}, reverse::Bool=false) where {T, S}
+        compilerbarrier(:type, nothing)
     end
 end
+function tfunc(::typeof(Intrinsics.scan), argtypes::Vector{Any})
+    length(argtypes) < 2 && return nothing
+    operands = CC.widenconst(argtypes[2])
+    (operands isa DataType && operands <: Tuple) || return nothing
+    # The result tuple repeats the operand types; each must be a `DataType` under `Tile`.
+    elems = operands.parameters
+    for elem in elems
+        (elem isa DataType && elem <: Tile) || return nothing
+    end
+    return Tuple{elems...}
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -740,7 +844,7 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.scan), args)
     N = length(tile_tvs)
 
     # Get scan axis
-    axis = @something get_constant(ctx, args[2]) throw(IRError("Scan axis must be a compile-time constant"))
+    axis = get_constant(ctx, args[2])
 
     # Resolve combiner function
     func = get_constant(ctx, args[3])
@@ -844,11 +948,17 @@ end
 # to_scalar: jltype becomes scalar T (for overlay dispatch), but IR value stays shaped.
 # from_scalar: restores jltype to Tile{T, S}.
 @eval Intrinsics begin
-    @noinline to_scalar(tile::Tile{T, S}) where {T, S} = (donotdelete(tile); compilerbarrier(:const, T(0)))
-    # S is a tuple TYPE (e.g., Tuple{16}) passed through from the input tile
-    @noinline from_scalar(x::T, ::Val{S}) where {T, S} = (donotdelete(x); Tile{T, S}())
+    @noinline to_scalar(tile::Tile{T, S}) where {T, S} = compilerbarrier(:const, T(0))
+    @noinline from_scalar(x::T, ::Type{S}) where {T, S} = Tile{T, S}()
+end
+function tfunc(::typeof(Intrinsics.from_scalar), argtypes::Vector{Any})
+    length(argtypes) >= 3 || return nothing  # argtypes[2] = scalar, argtypes[3] = shape type
+    shape_type = CC.widenconst(argtypes[3])
+    # `DataType <: Type` also holds but has no parameters; only a `Type{S}` carries S.
+    shape_type isa DataType && shape_type <: Type || return nothing
+    length(shape_type.parameters) == 1 || return nothing
+    return Tile{CC.widenconst(argtypes[2]), shape_type.parameters[1]}
 end
-
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.to_scalar), args)
     tv = emit_value!(ctx, args[1])
     tv === nothing && throw(IRError("Cannot resolve tile for to_scalar"))
@@ -861,9 +971,9 @@ end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.from_scalar), args)
     tv = emit_value!(ctx, args[1])
     tv === nothing && throw(IRError("Cannot resolve scalar for from_scalar"))
-    shape_val = @something get_constant(ctx, args[2]) throw(IRError("from_scalar shape must be constant"))
+    shape_type = get_constant(ctx, args[2])
     T = CC.widenconst(tv.jltype)
-    CGVal(tv.v, tv.type_id, Tile{T, shape_val}, tv.shape, nothing, nothing, nothing)
+    CGVal(tv.v, tv.type_id, Tile{T, shape_type}, tv.shape, nothing, nothing, nothing)
 end
 
 # TODO: cuda_tile.unpack
diff --git a/src/compiler/intrinsics/math.jl b/src/compiler/intrinsics/math.jl
index f2572d1..ded13df 100644
--- a/src/compiler/intrinsics/math.jl
+++ b/src/compiler/intrinsics/math.jl
@@ -81,8 +81,8 @@ end
 # cuda_tile.fma
 @eval Intrinsics begin
     """Fused multiply-add: a * b + c. Compiled to cuda_tile.fma."""
-    @noinline fma(x::T, y::T, z::T) where {T<:AbstractFloat} = (donotdelete(y, z); compilerbarrier(:const, x))
-    @noinline fma(a::Tile{T, S}, b::Tile{T, S}, c::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b, c); Tile{T, S}())
+    @noinline fma(x::T, y::T, z::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
+    @noinline fma(a::Tile{T, S}, b::Tile{T, S}, c::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.fma), args)
     cb = ctx.cb
@@ -135,7 +135,7 @@ end
 # cuda_tile.maxf
 @eval Intrinsics begin
     @noinline maxf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x > y || isnan(x), x, y)
-    @noinline maxf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline maxf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.maxf), args)
     emit_binop!(ctx, args, encode_MaxFOp!)
@@ -144,7 +144,7 @@ end
 # cuda_tile.minf
 @eval Intrinsics begin
     @noinline minf(x::T, y::T) where {T<:AbstractFloat} = ifelse(x < y || isnan(x), x, y)
-    @noinline minf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline minf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.minf), args)
     emit_binop!(ctx, args, encode_MinFOp!)
@@ -153,8 +153,8 @@ end
 # cuda_tile.pow
 @eval Intrinsics begin
     """Element-wise power. Compiled to cuda_tile.pow."""
-    @noinline pow(x::T, y::T) where {T<:AbstractFloat} = (donotdelete(x, y); compilerbarrier(:const, x))
-    @noinline pow(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline pow(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
+    @noinline pow(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.pow), args)
     emit_binop!(ctx, args, encode_PowOp!)
@@ -164,7 +164,7 @@ end
 @eval Intrinsics begin
     """Element-wise floating-point remainder. Compiled to cuda_tile.remf."""
     @noinline remf(x::T, y::T) where {T<:AbstractFloat} = compilerbarrier(:const, x)
-    @noinline remf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = (donotdelete(a, b); Tile{T, S}())
+    @noinline remf(a::Tile{T, S}, b::Tile{T, S}) where {T<:AbstractFloat, S} = Tile{T, S}()
 end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.remf), args)
     emit_binop!(ctx, args, encode_RemFOp!)
diff --git a/src/compiler/intrinsics/memory.jl b/src/compiler/intrinsics/memory.jl
index 3bae03d..1d42ad5 100644
--- a/src/compiler/intrinsics/memory.jl
+++ b/src/compiler/intrinsics/memory.jl
@@ -18,7 +18,6 @@
                                      latency::Union{Int, Nothing}=nothing,
                                      mask::Union{Tile{Bool, S}, Nothing}=nothing,
                                      padding::Union{Tile{T, S}, Nothing}=nothing) where {T, S}
-        donotdelete(ptrs, latency, mask, padding)
         Tile{T, S}()
     end
 end
@@ -96,7 +95,7 @@ end
     @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
                                       latency::Union{Int, Nothing},
                                       mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
-        donotdelete(ptrs, values, latency, mask)
+        donotdelete()
         nothing
     end
 end
diff --git a/src/compiler/intrinsics/views.jl b/src/compiler/intrinsics/views.jl
index 9fc00b3..c8f1a88 100644
--- a/src/compiler/intrinsics/views.jl
+++ b/src/compiler/intrinsics/views.jl
@@ -32,7 +32,6 @@ end
     Compiled to cuda_tile.get_index_space_shape.
     """
     @noinline function get_index_space_shape(pv::PartitionView{T, N, Shape}, axis::Integer) where {T, N, Shape}
-        donotdelete(pv)
         compilerbarrier(:const, zero(Int32))
     end
 end
@@ -81,11 +80,20 @@ end
                                             latency::Union{Int, Nothing},
                                             allow_tma::Bool,
                                             indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
-        donotdelete(pv, latency, allow_tma)
-        # Shape is already a tuple TYPE (e.g., Tuple{64}) from make_partition_view
-        Tile{T, Shape}()
+        compilerbarrier(:type, nothing)
     end
 end
+function tfunc(::typeof(Intrinsics.load_partition_view), argtypes::Vector{Any})
+    length(argtypes) < 2 && return nothing
+    view_type = CC.widenconst(argtypes[2])
+    # `.parameters` is only read once the view type is known to be a `DataType`.
+    (view_type <: PartitionView && view_type isa DataType) || return nothing
+    length(view_type.parameters) < 3 && return nothing
+    elt = eltype(view_type)
+    tile_shape = view_type.parameters[3]
+    tile_shape isa Type || return nothing
+    return Tile{elt, tile_shape}
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_partition_view), args)
     cb = ctx.cb
     tt = ctx.tt
@@ -169,21 +177,31 @@ end
         make_partition_view(tv::TensorView, shape_val, padding_mode, order) -> PartitionView
 
     Create a PartitionView from a TensorView with the given tile shape.
-    The `order` parameter (Val{NTuple{N,Int}} or Val{nothing}) specifies
+    The `order` parameter (NTuple{N,Int} or nothing) specifies
     the logical-to-physical dimension mapping (1-indexed), or identity if nothing.
     Compiled to cuda_tile.make_partition_view.
     """
-    @noinline function make_partition_view(tv::TensorView{T, N}, ::Val{Shape}, padding_mode::Int, ::Val{Order}) where {T, N, Shape, Order}
-        donotdelete(tv)
-        PartitionView{T, N, Tuple{Shape...}}()
+    @noinline function make_partition_view(tv::TensorView{T, N}, shape::NTuple{M, Int}, padding_mode::Int, order) where {T, N, M}
+        compilerbarrier(:type, nothing)
     end
 end
+function tfunc(::typeof(Intrinsics.make_partition_view), argtypes::Vector{Any})
+    length(argtypes) < 3 && return nothing
+    tensor_view = CC.widenconst(argtypes[2])
+    tensor_view <: TensorView || return nothing
+    # The tile shape is lifted into the result type, so it must be a constant.
+    tile_shape = argtypes[3]
+    tile_shape isa CC.Const || return nothing
+    dims = tile_shape.val
+    elt, rank = eltype(tensor_view), ndims(tensor_view)
+    return PartitionView{elt, rank, Tuple{dims...}}
+end
 function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_partition_view), args)
     tv = emit_value!(ctx, args[1])
     tv === nothing && throw(IRError("make_partition_view() requires a TensorView argument"))
 
-    # User boundary: Val{Shape} contains VALUE tuple from user call (e.g., load(arr, idx, (16,)))
-    shape = @something get_constant(ctx, args[2]) throw(IRError("make_partition_view() shape must be a compile-time constant"))
+    # Shape from user call (e.g., load(arr, idx, (16,)))
+    shape = get_constant(ctx, args[2])
     shape isa Tuple || throw(IRError("make_partition_view() shape must be a tuple, got $(typeof(shape))"))
     tile_shape = collect(Int, shape)
     validate_tile_shape(tile_shape, "load")
@@ -195,11 +213,9 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.make_partition_view), a
     elem_type = eltype(tv.jltype)
     ndim = length(tile_shape)
 
-    # Extract order from Val{Order} (arg 4)
-    # Val{nothing} → identity dim_map, Val{(2,1)} → [1, 0] (1-indexed → 0-indexed)
-    order_arg = emit_value!(ctx, args[4])
-    order_jltype = CC.widenconst(order_arg.jltype)
-    order_val = order_jltype.parameters[1]  # nothing or NTuple{N,Int}
+    # Extract order (arg 4)
+    # nothing → identity dim_map, (2,1) → [1, 0] (1-indexed → 0-indexed)
+    order_val = get_constant(ctx, args[4])
     if order_val === nothing
         dim_map = collect(0:ndim-1)
     else
@@ -328,7 +344,6 @@ end
     Compiled to cuda_tile.make_tensor_view.
     """
     @noinline function make_tensor_view(arr::TileArray{T, N})::TensorView{T, N} where {T, N}
-        donotdelete(arr)
         TensorView{T, N}()
     end
 end
@@ -359,11 +374,11 @@ end
     Compiled to cuda_tile.store_view_tko.
     """
     @noinline function store_partition_view(pv::PartitionView{T, N, Shape},
-                                             tile::Tile{T, Shape},
+                                             tile::Tile{T},
                                              latency::Union{Int, Nothing},
                                              allow_tma::Bool,
                                              indices::NTuple{M, <:Integer}) where {T, N, Shape, M}
-        donotdelete(pv, tile, latency, allow_tma)
+        donotdelete()
         nothing
     end
 end
diff --git a/src/language/operations.jl b/src/language/operations.jl
index b368db4..0d49a7e 100644
--- a/src/language/operations.jl
+++ b/src/language/operations.jl
@@ -61,7 +61,7 @@ Axis is 1-indexed. Equivalent to cld(arr.sizes[axis], shape[axis]).
 """
 @inline function num_tiles(arr::TileArray, axis::Integer, shape::NTuple{<:Any, Int})
     tv = Intrinsics.make_tensor_view(arr)
-    pv = Intrinsics.make_partition_view(tv, Val(shape), PaddingMode.Undetermined, Val(nothing))
+    pv = Intrinsics.make_partition_view(tv, shape, PaddingMode.Undetermined, nothing)
     Intrinsics.get_index_space_shape(pv, axis - One())  # convert to 0-indexed
 end
 
@@ -103,7 +103,7 @@ tile = ct.load(arr, (bidx, bidy), (TM, TN); order=(2, 1))
                       latency::Union{Int, Nothing}=nothing,
                       allow_tma::Bool=true)
     tv = Intrinsics.make_tensor_view(arr)
-    pv = Intrinsics.make_partition_view(tv, Val(shape), padding_mode, Val(order))
+    pv = Intrinsics.make_partition_view(tv, shape, padding_mode, order)
     Intrinsics.load_partition_view(pv, latency, allow_tma, promote(index...) .- One())
 end
 
@@ -113,7 +113,7 @@ end
                       latency::Union{Int, Nothing}=nothing,
                       allow_tma::Bool=true)
     tv = Intrinsics.make_tensor_view(arr)
-    pv = Intrinsics.make_partition_view(tv, Val(shape), padding_mode, Val(order))
+    pv = Intrinsics.make_partition_view(tv, shape, padding_mode, order)
     Intrinsics.load_partition_view(pv, latency, allow_tma, (index - One(),))
 end
 
@@ -125,7 +125,7 @@ end
                       allow_tma::Bool=true)
     shape_val = _extract_shape(shape)
     tv = Intrinsics.make_tensor_view(arr)
-    pv = Intrinsics.make_partition_view(tv, Val(shape_val), padding_mode, Val(order))
+    pv = Intrinsics.make_partition_view(tv, shape_val, padding_mode, order)
     Intrinsics.load_partition_view(pv, latency, allow_tma, promote(index...) .- One())
 end
 
@@ -137,7 +137,7 @@ end
                       allow_tma::Bool=true)
     shape_val = _extract_shape(shape)
     tv = Intrinsics.make_tensor_view(arr)
-    pv = Intrinsics.make_partition_view(tv, Val(shape_val), padding_mode, Val(order))
+    pv = Intrinsics.make_partition_view(tv, shape_val, padding_mode, order)
     Intrinsics.load_partition_view(pv, latency, allow_tma, promote(index...) .- One())
 end
 
@@ -184,7 +184,7 @@ Returns the stored tile (enables chaining and helps constant folding).
                        latency::Union{Int, Nothing}=nothing,
                        allow_tma::Bool=true) where {T}
     reshaped = _reshape_for_store(tile, Val(ndims(arr)))
-    _store_reshaped(arr, reshaped, Val(order), latency, allow_tma, promote(index...) .- One())
+    _store_reshaped(arr, reshaped, order, latency, allow_tma, promote(index...) .- One())
     return tile  # XXX: enables constant folding; remove when possible (see "constant folding" test)
 end
 
@@ -193,14 +193,14 @@ end
                        latency::Union{Int, Nothing}=nothing,
                        allow_tma::Bool=true) where {T}
     reshaped = _reshape_for_store(tile, Val(ndims(arr)))
-    _store_reshaped(arr, reshaped, Val(order), latency, allow_tma, (index - One(),))
+    _store_reshaped(arr, reshaped, order, latency, allow_tma, (index - One(),))
     return tile  # XXX: enables constant folding; remove when possible (see "constant folding" test)
 end
 
 @inline function _store_reshaped(arr::TileArray{T}, tile::Tile{T},
-                                 ::Val{Order}, latency, allow_tma, indices::NTuple{<:Any, <:Integer}) where {T, Order}
+                                 order, latency, allow_tma, indices::NTuple{<:Any, <:Integer}) where {T}
     tv = Intrinsics.make_tensor_view(arr)
-    pv = Intrinsics.make_partition_view(tv, Val(size(tile)), PaddingMode.Undetermined, Val(Order))
+    pv = Intrinsics.make_partition_view(tv, size(tile), PaddingMode.Undetermined, order)
     Intrinsics.store_partition_view(pv, tile, latency, allow_tma, indices)
 end
 
@@ -464,11 +464,11 @@ combined_last = ct.cat((tile_a, tile_b), -1)
 """
 @inline function cat(tiles::Tuple{Tile{T, S1}, Tile{T, S2}}, axis::Int) where {T, S1, S2}
     axis0 = axis < 0 ? axis : axis - 1
-    Intrinsics.cat(tiles, Val(axis0))
+    Intrinsics.cat(tiles, axis0)
 end
 @inline function cat(tiles::Tuple{Tile{T, S1}, Tile{T, S2}}, ::Val{Axis}) where {T, S1, S2, Axis}
     axis0 = Axis < 0 ? Axis : Axis - 1
-    Intrinsics.cat(tiles, Val(axis0))
+    Intrinsics.cat(tiles, axis0)
 end
 
 """
@@ -483,7 +483,7 @@ expanded = ct.broadcast_to(row, (64, 128))  # Shape (64, 128)
 ```
 """
 @inline broadcast_to(tile::Tile{T}, shape::NTuple{<:Any, Int}) where {T} =
-    Intrinsics.broadcast(tile, Val(shape))
+    Intrinsics.broadcast(tile, shape)
 
 """
     reshape(tile::Tile{T, S}, shape::NTuple{N, Int}) -> Tile{T, shape}
@@ -497,7 +497,7 @@ reshaped = reshape(tile, (2, 16))  # Shape (2, 16), still 32 elements
 ```
 """
 @inline Base.reshape(tile::Tile{T}, shape::NTuple{<:Any, Int}) where {T} =
-    Intrinsics.reshape(tile, Val(shape))
+    Intrinsics.reshape(tile, shape)
 
 """
     permutedims(tile::Tile{T, S}, perm) -> Tile{T, permuted_shape}
@@ -512,9 +512,9 @@ permuted = permutedims(tile, (3, 1, 2))    # Shape (4, 2, 3)
 ```
 """
 @inline Base.permutedims(tile::Tile{T}, perm::NTuple{<:Any, Int}) where {T} =
-    Intrinsics.permute(tile, Val(map(p -> p - 1, perm)))
+    Intrinsics.permute(tile, map(p -> p - 1, perm))
 @inline Base.permutedims(tile::Tile{T}, ::Val{Perm}) where {T, Perm} =
-    Intrinsics.permute(tile, Val(map(p -> p - 1, Perm)))
+    Intrinsics.permute(tile, map(p -> p - 1, Perm))
 
 """
     permutedims(tile::Tile{T, (M, N)}) -> Tile{T, (N, M)}
@@ -537,9 +537,9 @@ Differs from `transpose` in that the operation is not recursive.
     first_dim = n >= 1 ? size(T, 1) : nothing
 
     if n == 2
-        return :(Intrinsics.permute(tile, Val((1, 0))))
+        return :(Intrinsics.permute(tile, (1, 0)))
     elseif n == 1
-        return :(Intrinsics.reshape(tile, Val((1, $first_dim))))
+        return :(Intrinsics.reshape(tile, (1, $first_dim)))
     else
         return :(throw(ArgumentError("permutedims(tile) only works for 1D or 2D tiles")))
     end
@@ -592,7 +592,7 @@ result = map(+, a, b)            # Element-wise addition (same shape required)
 ```
 """
 @inline function Base.map(f, a::Tile{<:Any,S}, rest::Tile{<:Any,S}...) where {S}
-    Intrinsics.from_scalar(f(Intrinsics.to_scalar(a), map(Intrinsics.to_scalar, rest)...), Val(S))
+    Intrinsics.from_scalar(f(Intrinsics.to_scalar(a), map(Intrinsics.to_scalar, rest)...), S)
 end
 
 """
@@ -624,7 +624,7 @@ vals, idxs = mapreduce(identity, reducer, vals_tile, idx_tile;
 """
 @inline function Base.mapreduce(::typeof(identity), f, tile::Tile{T,S};
                                 dims::Integer, init) where {T<:Number, S}
-    Intrinsics.reduce((tile,), Val(dims - 1), f, (T(init),))[1]
+    Intrinsics.reduce((tile,), dims - 1, f, (T(init),))[1]
 end
 
 @inline function Base.mapreduce(f, op, tile::Tile{T,S};
@@ -641,7 +641,7 @@ end
     function _combiner(args...)
         f(_deinterleave_accs(args...), _deinterleave_elems(args...))
     end
-    Intrinsics.reduce(all_tiles, Val(dims - 1), _combiner, init)
+    Intrinsics.reduce(all_tiles, dims - 1, _combiner, init)
 end
 
 """
@@ -827,7 +827,7 @@ Supported functions: `+`, `*`, `max`, `min`.
 """
 @inline function Base.accumulate(f, tile::Tile{T,S}; dims::Integer,
                                  init, rev::Bool=false) where {T<:Number, S}
-    Intrinsics.scan((tile,), Val(dims - 1), f, (T(init),), rev)[1]
+    Intrinsics.scan((tile,), dims - 1, f, (T(init),), rev)[1]
 end
 
 """
@@ -921,6 +921,6 @@ br = ct.extract(tile, (2, 2), (4, 4))  # Bottom-right (rows 5-8, cols 5-8)
 ```
 """
 @inline extract(tile::Tile{T}, index::NTuple{<:Any, Int}, shape::NTuple{<:Any, Int}) where {T} =
-    Intrinsics.extract(tile, Val(map(i -> i - 1, index)), Val(shape))
+    Intrinsics.extract(tile, map(i -> i - 1, index), shape)
 @inline extract(tile::Tile{T}, ::Val{Index}, ::Val{Shape}) where {T, Index, Shape} =
-    Intrinsics.extract(tile, Val(map(i -> i - 1, Index)), Val(Shape))
+    Intrinsics.extract(tile, map(i -> i - 1, Index), Shape)
diff --git a/test/codegen.jl b/test/codegen.jl
index ff984cf..a27bbf3 100644
--- a/test/codegen.jl
+++ b/test/codegen.jl
@@ -343,13 +343,18 @@
         @testset "mixed-type integer comparison" begin
             @test @filecheck begin
                 @check_label "entry"
-                code_tiled(Tuple{}) do
+                code_tiled(Tuple{ct.TileArray{Int64,1,spec1d}}) do out
                     a = ct.arange((16,), Int64)
                     b = ct.arange((16,), Int32)
                     # Should promote Int32 to Int64 and compare
                     @check "exti"
                     @check "cmpi"
+                    @check "select"
                     result = a .< b
+                    # Use same-typed operands for where to avoid Union type
+                    b_promoted = ct.astype(b, Int64)
+                    selected = ct.where(result, a, b_promoted)
+                    ct.store(out, Int32(0), selected)
                     return
                 end
             end
@@ -1070,6 +1075,32 @@
             end
         end
 
+        @testset "power operations" begin
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}}) do a
+                    pid = ct.bid(1)
+                    tile = ct.load(a, pid, (16,))
+                    @check "pow"
+                    Base.donotdelete(tile .^ tile)
+                    return
+                end
+            end
+
+            # Scalar exponent: besides "pow", the output is also checked for a "broadcast".
+            @test @filecheck begin
+                @check_label "entry"
+                code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}}) do a
+                    pid = ct.bid(1)
+                    tile = ct.load(a, pid, (16,))
+                    @check "broadcast"
+                    @check "pow"
+                    Base.donotdelete(tile .^ 2.0f0)
+                    return
+                end
+            end
+        end
+
         @testset "scalar math functions" begin
             # Test scalar math functions via overlays (sin, exp, sqrt, etc. on scalars)
             # Note: We pass scalar args to avoid constant folding at compile time
@@ -1324,11 +1355,16 @@
             spec = ct.ArraySpec{2}(16, true)
             @test @filecheck begin
                 @check_label "entry"
-                code_tiled(Tuple{ct.TileArray{Float32,2,spec}}) do a
+                code_tiled(Tuple{ct.TileArray{Float32,2,spec}, ct.TileArray{Float32,2,spec}}) do a, b
                     @check "make_tensor_view"
                     @check "make_partition_view"
                     @check "get_index_space_shape"
                     num = ct.num_tiles(a, 1, (32, 32))
+                    # Use num as a tile index to prevent DCE
+                    @check "load_view_tko"
+                    tile = ct.load(a, (num, Int32(0)), (32, 32))
+                    @check "store_view_tko"
+                    ct.store(b, (Int32(0), Int32(0)), tile)
                     return
                 end
             end
@@ -1894,6 +1930,20 @@ end
     @testset "method error detection" begin
         spec = ct.ArraySpec{1}(16, true)
 
+        isdefined(Core, :throw_methoderror) &&
+        @testset "mismatched tile shapes with + produces MethodError" begin
+            spec2d = ct.ArraySpec{2}(16, true)
+            @test_throws "MethodError during Tile IR compilation" begin
+                code_tiled(Tuple{ct.TileArray{Float32,2,spec2d}}) do a
+                    pid = ct.bid(1)
+                    tile_a = ct.load(a, pid, (4, 8))
+                    tile_b = ct.load(a, pid, (8, 4))
+                    Base.donotdelete(tile_a + tile_b)
+                    return
+                end
+            end
+        end
+
         isdefined(Core, :throw_methoderror) &&
         @testset "no matching method produces MethodError" begin
             only_ints(x::Int) = x
@@ -2020,6 +2070,69 @@ end
     end
 end
 
+#=============================================================================
+ External Constants (GlobalRef handling)
+=============================================================================#
+
+# Constants defined outside the kernel (module-level `const`) appear as GlobalRef
+# nodes in Julia IR. These must emit proper ConstantOp for numeric types,
+# not ghost values (which produce nothing in the bytecode).
+
+const _CODEGEN_TEST_FLOAT32 = Float32(1 / log(2))
+const _CODEGEN_TEST_FLOAT64 = 3.14159
+
+@testset "External Constants" begin
+    spec1d = ct.ArraySpec{1}(16, true)
+
+    @testset "external Float32 constant in arithmetic" begin
+        # Bug 1: GlobalRef for Float32 must emit ConstantOp, not a ghost value.
+        # Previously, emit_value!(ctx, ::GlobalRef) wrapped all values as ghosts,
+        # causing MulFOp to receive `nothing` instead of a bytecode Value.
+        @test @filecheck begin
+            @check_label "entry"
+            code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}}) do a
+                pid = ct.bid(1)
+                tile = ct.load(a, pid, (16,))
+                @check "constant <f32"
+                @check "mulf"
+                Base.donotdelete(tile * _CODEGEN_TEST_FLOAT32)
+                return
+            end
+        end
+    end
+
+    @testset "external Float64 constant in arithmetic" begin
+        @test @filecheck begin
+            @check_label "entry"
+            code_tiled(Tuple{ct.TileArray{Float64,1,spec1d}}) do a
+                pid = ct.bid(1)
+                tile = ct.load(a, pid, (16,))
+                @check "constant <f64"
+                @check "mulf"
+                Base.donotdelete(tile * _CODEGEN_TEST_FLOAT64)
+                return
+            end
+        end
+    end
+
+    @testset "external constant assigned to local variable" begin
+        # Bug 2: GlobalRef on RHS of assignment in emit_rhs! returned nothing.
+        # Using a local variable forces Julia to emit an assignment from the GlobalRef.
+        @test @filecheck begin
+            @check_label "entry"
+            code_tiled(Tuple{ct.TileArray{Float32,1,spec1d}}) do a
+                pid = ct.bid(1)
+                tile = ct.load(a, pid, (16,))
+                local_const = _CODEGEN_TEST_FLOAT32
+                @check "constant <f32"
+                @check "mulf"
+                Base.donotdelete(tile * local_const)
+                return
+            end
+        end
+    end
+end
+
 #=============================================================================
  Entry Hints (kernel-level optimization hints)
 =============================================================================#
diff --git a/test/types.jl b/test/types.jl
index 5832451..e46f135 100644
--- a/test/types.jl
+++ b/test/types.jl
@@ -46,112 +46,3 @@ end
     @test ndims(cuTile.TensorView{Float32, 2}) == 2
     @test ndims(cuTile.TensorView{Int32, 1}) == 1
 end
-
-@testset "mismatched shapes with + throws MethodError" begin
-    tile_a = ct.Tile{Float32, Tuple{1, 128}}()
-    tile_b = ct.Tile{Float32, Tuple{64, 1}}()
-
-    # + should require same shapes, so this should fail
-    @test_throws MethodError tile_a + tile_b
-
-    # But .+ should work (broadcasting)
-    result = tile_a .+ tile_b
-    @test result isa ct.Tile{Float32, Tuple{64, 128}}
-end
-
-@testset "comparison operations" begin
-
-@testset "float comparison operators" begin
-    tile = ct.Tile{Float32, Tuple{16}}()
-
-    @test (tile .< tile) isa ct.Tile{Bool, Tuple{16}}
-    @test (tile .> tile) isa ct.Tile{Bool, Tuple{16}}
-    @test (tile .<= tile) isa ct.Tile{Bool, Tuple{16}}
-    @test (tile .>= tile) isa ct.Tile{Bool, Tuple{16}}
-    @test (tile .== tile) isa ct.Tile{Bool, Tuple{16}}
-    @test (tile .!= tile) isa ct.Tile{Bool, Tuple{16}}
-end
-
-@testset "integer comparison operators" begin
-    int_tile = ct.arange((16,), Int)
-
-    @test (int_tile .< int_tile) isa ct.Tile{Bool, Tuple{16}}
-    @test (int_tile .> int_tile) isa ct.Tile{Bool, Tuple{16}}
-    @test (int_tile .<= int_tile) isa ct.Tile{Bool, Tuple{16}}
-    @test (int_tile .>= int_tile) isa ct.Tile{Bool, Tuple{16}}
-    @test (int_tile .== int_tile) isa ct.Tile{Bool, Tuple{16}}
-    @test (int_tile .!= int_tile) isa ct.Tile{Bool, Tuple{16}}
-end
-
-@testset "tile vs scalar comparison" begin
-    int_tile = ct.arange((16,), Int)
-    float_tile = ct.Tile{Float32, Tuple{16}}()
-
-    @test (int_tile .< 10) isa ct.Tile{Bool, Tuple{16}}
-    @test (5 .< int_tile) isa ct.Tile{Bool, Tuple{16}}
-
-    @test (float_tile .< 2.0f0) isa ct.Tile{Bool, Tuple{16}}
-    @test (1.0f0 .> float_tile) isa ct.Tile{Bool, Tuple{16}}
-end
-
-@testset "broadcast comparison shapes" begin
-    tile_a = ct.Tile{Float32, Tuple{1, 16}}()
-    tile_b = ct.Tile{Float32, Tuple{8, 1}}()
-
-    result = tile_a .< tile_b
-    @test result isa ct.Tile{Bool, Tuple{8, 16}}
-end
-
-end
-
-@testset "power operations" begin
-
-@testset "float tile .^ float tile" begin
-    tile = ct.Tile{Float32, Tuple{16}}()
-    @test (tile .^ tile) isa ct.Tile{Float32, Tuple{16}}
-end
-
-@testset "float tile .^ scalar" begin
-    tile = ct.Tile{Float32, Tuple{16}}()
-    @test (tile .^ 2.0f0) isa ct.Tile{Float32, Tuple{16}}
-    @test (2.0f0 .^ tile) isa ct.Tile{Float32, Tuple{16}}
-end
-
-@testset "broadcast power shapes" begin
-    tile_a = ct.Tile{Float32, Tuple{1, 16}}()
-    tile_b = ct.Tile{Float32, Tuple{8, 1}}()
-    @test (tile_a .^ tile_b) isa ct.Tile{Float32, Tuple{8, 16}}
-end
-
-@testset "integer power dispatches through generic broadcast" begin
-    int_tile = ct.arange((16,), Int)
-    # Generic copy→map accepts this (no MethodError), but it will fail
-    # at codegen time since there's no ^ overlay for integers.
-    @test (int_tile .^ int_tile) isa ct.Tile
-end
-
-end
-
-@testset "multi-arg map" begin
-    a = ct.Tile{Float32, Tuple{16}}()
-    b = ct.Tile{Float32, Tuple{16}}()
-    c = ct.Tile{Float32, Tuple{16}}()
-
-    # Binary map
-    @test map(+, a, b) isa ct.Tile{Float32, Tuple{16}}
-
-    # Ternary map
-    @test map(fma, a, b, c) isa ct.Tile{Float32, Tuple{16}}
-
-    # Broadcasting goes through the .op path, not map directly
-    @test (a .+ 1.0f0) isa ct.Tile{Float32, Tuple{16}}
-    @test (1.0f0 .+ a) isa ct.Tile{Float32, Tuple{16}}
-
-    # Broadcasting with different shapes goes through .op path
-    row = ct.Tile{Float32, Tuple{4, 1}}()
-    col = ct.Tile{Float32, Tuple{1, 16}}()
-    @test (row .+ col) isa ct.Tile{Float32, Tuple{4, 16}}
-
-    # Nested broadcast expression: a .+ b .* c
-    @test (a .+ b .* c) isa ct.Tile{Float32, Tuple{16}}
-end