diff --git a/examples/attention.jl b/examples/attention.jl
new file mode 100644
index 0000000..ad58df5
--- /dev/null
+++ b/examples/attention.jl
@@ -0,0 +1,289 @@
+# Fused Multi-Head Attention example - Julia port of cuTile Python's AttentionFMHA.py sample
+#
+# SPDX-License-Identifier: Apache-2.0
+
+using CUDA
+import cuTile as ct
+
+import NNlib
+import CUDA.GPUArrays: AllocCache, @cached # for a fairer NNlib comparison
+
+const INV_LOG_2 = Float32(1 / log(2))
+const ConstInt = ct.Constant{Int}
+const ConstBool = ct.Constant{Bool}
+
+# cuTile kernel for Fused Multi-Head Attention (FMHA)
+#
+# Computes attention output for a specific batch item and head,
+# using tiling and online softmax.
+#
+# Layout: (D, SeqLen, Heads, Batch)
+function fmha_kernel(
+    Q::ct.TileArray{T,4},
+    K::ct.TileArray{T,4},
+    V::ct.TileArray{T,4},
+    Out::ct.TileArray{T,4},
+    qk_scale::AbstractFloat,
+    input_pos::Integer,
+    D_K::ConstInt, # Head dimension of Q and K
+    D_V::ConstInt, # Head dimension of V
+    H::ConstInt,
+    TILE_M::ConstInt,
+    TILE_N::ConstInt,
+    QUERY_GROUP_SIZE::ConstInt,
+    CAUSAL::ConstBool,
+    EVEN_K::ConstBool
+) where T
+    # Map block IDs to batch and head indices
+    bid_x = ct.bid(1)
+    bid_y = ct.bid(2)
+    batch_idx, head_idx = fldmod1(bid_y, H[]) # floored division and modulus for 1-based indexing
+    off_kv_h = cld(head_idx, QUERY_GROUP_SIZE[])
+
+    # Adjust qk_scale for exp2
+    qk_scale = Float32(qk_scale) * Float32(INV_LOG_2)
+
+    # Initialize offsets for current query tile (M-dimension)
+    # bid_x is 1-indexed, so first tile (bid_x=1) has offsets [0, TILE_M-1]
+    offs_m = (bid_x - 1) * TILE_M[] .+ (ct.arange((TILE_M[],), Int32) .- 1)
+    offs_m = offs_m .+ input_pos
+    offs_m = reshape(offs_m, (1, TILE_M[]))
+
+    # Local offsets for key/value tile (N-dimension)
+    offs_n_tile = ct.arange((TILE_N[],), Int32) .- 1
+    offs_n_tile = reshape(offs_n_tile, (TILE_N[], 1))
+
+    # Online softmax accumulators in Float32 for stability
+    m_i = ct.full((1, TILE_M[]), -Inf32, Float32)
+    l_i = ct.zeros((1, TILE_M[]), Float32)
+    acc = ct.zeros((D_V[], TILE_M[]), Float32)
+
+    # Load the query tile for this batch, head, and M-chunk
+    q = ct.load(Q, (1, bid_x, head_idx, batch_idx), (D_K[], TILE_M[], 1, 1))
+    q = reshape(q, (D_K[], TILE_M[]))
+
+    # m_end: one past the last query position in this tile
+    m_end = input_pos + bid_x * TILE_M[]
+    k_seqlen = K.sizes[2]
+    if CAUSAL[]
+        # Python: mask_start = (input_pos + bid_x * TILE_M) // TILE_N
+        # In Julia with 1-indexed bid_x: mask_start = (input_pos + (bid_x-1) * TILE_M) // TILE_N + 1
+        mask_start = fld(input_pos + (bid_x - 1) * TILE_M[], TILE_N[]) + 1
+        # Python: mask_start = min(mask_start, k_seqlen // TILE_N)
+        mask_start = min(mask_start, fld(k_seqlen, TILE_N[]) + 1)
+        Tc = cld(min(m_end, k_seqlen), TILE_N[])
+    else
+        Tc = cld(k_seqlen, TILE_N[])
+        # Python: mask_start = k_seqlen // TILE_N
+        mask_start = fld(k_seqlen, TILE_N[]) + 1
+    end
+
+    # Loop over K, V blocks (N-dimension chunks)
+    j = Int32(1)
+    while j <= Tc
+        k = ct.load(
+            K, (1, j, off_kv_h, batch_idx), (D_K[], TILE_N[], 1, 1),
+            latency=2)
+        k = reshape(k, (D_K[], TILE_N[]))
+        k = transpose(k)
+
+        qk = ct.zeros((TILE_N[], TILE_M[]), Float32)
+        qk = ct.muladd(k, q, qk)
+
+        # Apply masking (matches Python: if (CAUSAL or not EVEN_K) and j >= mask_start)
+        if (CAUSAL[] || !EVEN_K[]) && j >= mask_start
+            offs_n = (j - 1) * TILE_N[] .+ offs_n_tile
+            # Build mask: start with all true
+            mask = ct.full((TILE_N[], TILE_M[]), true, Bool)
+            # Out-of-bounds mask (Python: if not EVEN_K: mask = mask & (offs_n < k_seqlen))
+            if !EVEN_K[]
+                
mask = mask .& (offs_n .< k_seqlen) + end + # causal mask (Python: if CAUSAL: mask = mask & (offs_m >= offs_n)) + if CAUSAL[] + mask = mask .& (offs_m .>= offs_n) + end + # Apply mask: set invalid positions to -Inf + qk = ifelse.(mask, qk, -Inf32) + end + + # Online Softmax Update + # Moving qk_scale multiplication after reduce_max is to improve performance + m_ij = max.(m_i, maximum(qk, dims=1) * qk_scale) + qk = qk * qk_scale .- m_ij + + # attention weights [TILE_N, TILE_M] + p = exp2.(qk) # XXX: flush_to_zero=True + l_ij = sum(p, dims=1) + alpha = exp2.(m_i .- m_ij) # XXX: flush_to_zero=True + + l_i = l_i .* alpha .+ l_ij + acc = acc .* alpha + + v = ct.load( + V, (1, j, off_kv_h, batch_idx), (D_V[], TILE_N[], 1, 1), + latency=4) + v = reshape(v, (D_V[], TILE_N[])) + p = ct.astype(p, eltype(q)) + acc = ct.muladd(v, p, acc) + m_i = m_ij + + j += Int32(1) + end + + acc = acc ./ l_i # XXX: flush_to_zero=True, rounding_mode=APPROX + acc = reshape(acc, (D_V[], TILE_M[], 1, 1)) + acc = ct.astype(acc, eltype(Out)) + ct.store(Out, (1, bid_x, head_idx, batch_idx), acc) + + return +end + +function prepare(; benchmark::Bool=false, + D_k::Int=64, + SeqLen_Q::Int=benchmark ? 4096 : 256, + Heads::Int=4, + Batch::Int=4, + D_v::Int=D_k, + SeqLen_KV::Int=SeqLen_Q, + Heads_KV::Int=Heads, + causal::Bool=false, + T::DataType=Float32) + return (; + Q = CUDA.randn(T, D_k, SeqLen_Q, Heads, Batch), + K = CUDA.randn(T, D_k, SeqLen_KV, Heads_KV, Batch), + V = CUDA.randn(T, D_v, SeqLen_KV, Heads_KV, Batch), + Out = CUDA.randn(T, D_v, SeqLen_Q, Heads, Batch), + D_k, SeqLen_Q, Heads, Batch, + D_v, SeqLen_KV, Heads_KV, causal + ) +end + +function run(data; tm::Int=64, tn::Int=64, nruns::Int=1, warmup::Int=0) + (; Q, K, V, Out, D_k, D_v, SeqLen_Q, Heads, Batch, SeqLen_KV, Heads_KV, causal) = data + grid_x = cld(SeqLen_Q, tm) + grid_y = Heads * Batch + grid = (grid_x, grid_y) + + qk_scale = 1 / sqrt(D_k) + input_pos = 0 + + query_group_size, remainder = divrem(Heads, Heads_KV) + @assert remainder == 0 + + even_k = (SeqLen_KV % tn) == 0 + + CUDA.@sync for _ in 1:warmup + ct.launch(fmha_kernel, grid, Q, K, V, Out, + qk_scale, input_pos, + ct.Constant(D_k), ct.Constant(D_v), ct.Constant(Heads), + ct.Constant(tm), ct.Constant(tn), + ct.Constant(query_group_size), + ct.Constant(causal), ct.Constant(even_k)) + end + + times = Float64[] + for _ in 1:nruns + t = CUDA.@elapsed ct.launch(fmha_kernel, grid, Q, K, V, Out, + qk_scale, input_pos, + ct.Constant(D_k), ct.Constant(D_v), ct.Constant(Heads), + ct.Constant(tm), ct.Constant(tn), + ct.Constant(query_group_size), + ct.Constant(causal), ct.Constant(even_k)) + push!(times, t * 1000) + end + + return (; Out, times) +end + +function nnlib_attention( + Q::AbstractArray{T,4}, K::AbstractArray{T,4}, V::AbstractArray{T,4}; + causal::Bool = false, +) where T + mask = causal ? 
NNlib.make_causal_mask(Q; dims=2) : nothing + query_group_size = cld(size(Q, 3), size(K, 3)) + if query_group_size > 1 + K, V = repeat.((K, V), inner=(1, 1, query_group_size, 1)) + end + Out, _ = NNlib.dot_product_attention(Q, K, V; mask) + return Out +end + +function verify(data, result) + # run on GPU for proper accumulation + expected = nnlib_attention(data.Q, data.K, data.V; data.causal) + @assert isapprox(expected, result.Out, rtol=1e-2) "max diff: $(maximum(abs, result.Out - expected))" +end + +#============================================================================= + Reference implementations for benchmarking +=============================================================================# + +function run_others(data; nruns::Int=1, warmup::Int=0) + (; Q, K, V, causal) = data + results = Dict{String, Vector{Float64}}() + + cache = AllocCache() + + CUDA.@sync for _ in 1:warmup + @cached cache nnlib_attention(Q, K, V; causal) + end + times = Float64[] + for _ in 1:nruns + t = CUDA.@elapsed @cached cache nnlib_attention(Q, K, V; causal) + push!(times, t * 1000) + end + results["NNlib"] = times + + return results +end + +#============================================================================= + Main +=============================================================================# + +function test_attention(::Type{T}, + D_k, SeqLen_Q, Heads, Batch, + D_v, SeqLen_KV, Heads_KV, + causal, tm, tn; + name=nothing +) where T + name = something(name, + join([ + T, + "tile=$tm×$tn", + "Q=$D_k×$SeqLen_Q", + "K=$D_k×$SeqLen_KV", + "V=$D_v×$SeqLen_KV", + "Heads=$Heads/$Heads_KV", + "Batch=$Batch", + "causal=$causal" + ], ", ")) + println("--- $name ---") + data = prepare(; T, D_k, SeqLen_Q, Heads, Batch, D_v, SeqLen_KV, Heads_KV, causal) + result = run(data; tm, tn) + verify(data, result) + println(" passed") +end + +function main() + println("--- cuTile Fused Multi-Head Attention Examples ---\n") + + for T in (Float32, Float16) + # basic + test_attention(T, 64, 256, 8, 2, 64, 256, 8, false, 32, 32) + test_attention(T, 64, 256, 8, 2, 64, 128, 4, false, 32, 64) + test_attention(T, 64, 256, 8, 2, 64, 256, 8, true, 32, 32) + + # uneven seqlen + test_attention(T, 64, 128, 4, 1, 64, 97, 2, false, 32, 32) + test_attention(T, 64, 127, 4, 1, 64, 127, 4, true, 32, 32) + + # D_k != D_v + test_attention(T, 64, 256, 8, 2, 32, 256, 4, false, 32, 32) + end + + println("\n--- All attention examples completed ---") +end + +isinteractive() || main() diff --git a/examples/attention.py b/examples/attention.py new file mode 100644 index 0000000..4039add --- /dev/null +++ b/examples/attention.py @@ -0,0 +1,337 @@ +#!/usr/bin/env python3 +""" +Fused Multi-Head Attention example - cuTile Python +Julia port equivalent with prepare/run/verify pattern for benchmarking. +""" + +# SPDX-License-Identifier: Apache-2.0 + +import torch +import numpy as np +import cuda.tile as ct +from cuda.tile import RoundingMode as RMd +from math import ceil, sqrt + +from torch.nn.functional import scaled_dot_product_attention +from torch.nn.attention import sdpa_kernel, SDPBackend + +INV_LOG_2 = 1.0 / np.log(2) +ConstInt = ct.Constant[int] +ConstBool = ct.Constant[bool] + + +@ct.kernel(occupancy=2) +def fmha_kernel(Q, K, V, Out, + qk_scale: float, + input_pos: int, + D_K: ConstInt, # Head dimension of Q and K + D_V: ConstInt, # Head dimension of V + H: ConstInt, + TILE_M: ConstInt, + TILE_N: ConstInt, + QUERY_GROUP_SIZE: ConstInt, + CAUSAL: ConstBool, + EVEN_K: ConstBool): + """ + cuTile kernel for Fused Multi-Head Attention (FMHA). 
+ Computes attention output for a specific batch item and head, using tiling and online softmax. + + Layout: (Batch, Heads, SeqLen, D) + """ + # Map block IDs to batch and head indices + bid_x = ct.bid(0) + bid_y = ct.bid(1) + batch_idx = bid_y // H + head_idx = bid_y % H + off_kv_h = head_idx // QUERY_GROUP_SIZE + + # Adjust qk_scale for exp2 + qk_scale = qk_scale * INV_LOG_2 + + # Initialize offsets for current query tile (M-dimension) + offs_m = bid_x * TILE_M + ct.arange(TILE_M, dtype=np.int32) # [TILE_M] + offs_m += input_pos + offs_m = offs_m[:, None] # [TILE_M, 1] + + # Initialize local offsets for key/value tile (N-dimension) + offs_n_tile = ct.arange(TILE_N, dtype=np.int32) # [TILE_N] + offs_n_tile = offs_n_tile[None, :] # [1, TILE_N] + + # Initialize online softmax accumulators in float32 for stability + m_i = ct.full((TILE_M, 1), -np.inf, dtype=np.float32) + l_i = ct.full((TILE_M, 1), 0.0, dtype=np.float32) + acc = ct.full((TILE_M, D_V), 0.0, dtype=np.float32) + + # Load query tile for this batch, head, and M-chunk + q = ct.load( + Q, index=(batch_idx, head_idx, bid_x, 0), shape=(1, 1, TILE_M, D_K) + ).reshape((TILE_M, D_K)) # [TILE_M, D_K] + + # loop over k, v and update accumulator + m_end = input_pos + (bid_x + 1) * TILE_M + k_seqlen = K.shape[2] + if CAUSAL: + # when kv pos could exceed q pos + mask_start = (input_pos + bid_x * TILE_M) // TILE_N + # when kv pos could exceed k_seqlen + mask_start = min(mask_start, k_seqlen // TILE_N) + Tc = ct.cdiv(min(m_end, k_seqlen), TILE_N) + else: + Tc = ct.cdiv(k_seqlen, TILE_N) + mask_start = k_seqlen // TILE_N + + # Loop over K, V blocks (N-dimension chunks) + for j in range(0, Tc): + # --- Compute QK product --- + k = ct.load( + K, index=(batch_idx, off_kv_h, 0, j), shape=(1, 1, D_K, TILE_N), + order=(0, 1, 3, 2), + latency=2, + ) + k = k.reshape((D_K, TILE_N)) # [D_K, TILE_N] + qk = ct.full((TILE_M, TILE_N), 0., dtype=np.float32) + qk = ct.mma(q, k, qk) # [TILE_M, TILE_N] + + # --- Apply Causal Masking --- + if (CAUSAL or not EVEN_K) and j >= mask_start: + offs_n = j * TILE_N + offs_n_tile + mask = ct.full((TILE_M, TILE_N), True, dtype=np.bool) + # out of bound mask + if not EVEN_K: + mask = mask & (offs_n < k_seqlen) + # causal mask + if CAUSAL: + mask = mask & (offs_m >= offs_n) # [TILE_M, TILE_N] + mask = ct.where(mask, 0.0, -np.inf) # [TILE_M, TILE_N] + qk += mask + + # --- Online Softmax Update --- + # Moving qk_scale multiplication after reduce_max is to improve performance. 
+ m_ij = max(m_i, ct.max(qk, axis=-1, keepdims=True) * qk_scale) + qk = qk * qk_scale - m_ij # [TILE_M, TILE_N] + + # attention weights + p = ct.exp2(qk, flush_to_zero=True) # [TILE_M, TILE_N] + l_ij = ct.sum(p, axis=-1, keepdims=True) # [TILE_M, 1] + alpha = ct.exp2(m_i - m_ij, flush_to_zero=True) # [TILE_M, 1] + # update m_i and l_i + l_i = l_i * alpha + l_ij # [TILE_M, 1] + # scale acc + acc = acc * alpha # [TILE_M, D_V] + + # --- Compute PV product --- + v = ct.load( + V, index=(batch_idx, off_kv_h, j, 0), shape=(1, 1, TILE_N, D_V), + latency=4, + ).reshape((TILE_N, D_V)) # [TILE_N, D_V] + p = p.astype(Q.dtype) + acc = ct.mma(p, v, acc) # [TILE_M, D_V] + m_i = m_ij # [TILE_M, 1] + + # --- Final Normalization and Store --- + acc = ct.truediv(acc, l_i, flush_to_zero=True, rounding_mode=RMd.APPROX) + acc = acc.reshape((1, 1, TILE_M, D_V)).astype(Out.dtype) + ct.store(Out, index=(batch_idx, head_idx, bid_x, 0), tile=acc) + + +#============================================================================= +# Example harness +#============================================================================= + +def prepare(*, benchmark: bool = False, + D_k: int = 64, + SeqLen_Q: int = None, + Heads: int = 4, + Batch: int = 4, + D_v: int = None, + SeqLen_KV: int = None, + Heads_KV: int = None, + causal: bool = False, + dtype=torch.float32): + """Allocate and initialize data for FMHA.""" + if SeqLen_Q is None: + SeqLen_Q = 4096 if benchmark else 256 + if D_v is None: + D_v = D_k + if SeqLen_KV is None: + SeqLen_KV = SeqLen_Q + if Heads_KV is None: + Heads_KV = Heads + + # Layout: (Batch, Heads, SeqLen, D) + return { + "Q": torch.randn(Batch, Heads, SeqLen_Q, D_k, dtype=dtype, device='cuda'), + "K": torch.randn(Batch, Heads_KV, SeqLen_KV, D_k, dtype=dtype, device='cuda'), + "V": torch.randn(Batch, Heads_KV, SeqLen_KV, D_v, dtype=dtype, device='cuda'), + "Out": torch.empty(Batch, Heads, SeqLen_Q, D_v, dtype=dtype, device='cuda'), + "D_k": D_k, + "D_v": D_v, + "SeqLen_Q": SeqLen_Q, + "SeqLen_KV": SeqLen_KV, + "Heads": Heads, + "Heads_KV": Heads_KV, + "Batch": Batch, + "causal": causal, + } + + +def run(data, *, tm: int = 64, tn: int = 64, nruns: int = 1, warmup: int = 0): + """Run FMHA kernel with timing.""" + Q, K, V, Out = data["Q"], data["K"], data["V"], data["Out"] + D_k, D_v = data["D_k"], data["D_v"] + SeqLen_Q, SeqLen_KV = data["SeqLen_Q"], data["SeqLen_KV"] + Heads, Heads_KV, Batch = data["Heads"], data["Heads_KV"], data["Batch"] + causal = data["causal"] + + grid_x = ceil(SeqLen_Q / tm) + grid_y = Heads * Batch + grid = (grid_x, grid_y, 1) + + qk_scale = 1.0 / sqrt(D_k) + input_pos = 0 + + query_group_size, remainder = divmod(Heads, Heads_KV) + assert remainder == 0, "Heads must be divisible by Heads_KV" + + even_k = (SeqLen_KV % tn) == 0 + + stream = torch.cuda.current_stream() + + # Warmup + for _ in range(warmup): + ct.launch(stream, grid, fmha_kernel, ( + Q, K, V, Out, + qk_scale, input_pos, + D_k, D_v, Heads, + tm, tn, + query_group_size, + causal, even_k + )) + torch.cuda.synchronize() + + # Timed runs + times = [] + for _ in range(nruns): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + ct.launch(stream, grid, fmha_kernel, ( + Q, K, V, Out, + qk_scale, input_pos, + D_k, D_v, Heads, + tm, tn, + query_group_size, + causal, even_k + )) + end.record() + torch.cuda.synchronize() + times.append(start.elapsed_time(end)) # ms + + return {"Out": Out, "times": times} + + +def torch_sdpa(Q, K, V, *, causal: bool = False, enable_gqa: 
bool = False):
+    """Reference scaled dot-product attention using PyTorch."""
+    # Force the MATH backend (works with all dtypes);
+    # cuDNN/Flash only support float16/bfloat16.
+    with sdpa_kernel(SDPBackend.MATH):
+        return scaled_dot_product_attention(Q, K, V, is_causal=causal, enable_gqa=enable_gqa)
+
+
+def verify(data, result):
+    """Verify FMHA results against reference implementation."""
+    Q, K, V = data["Q"], data["K"], data["V"]
+    causal = data["causal"]
+    Heads, Heads_KV = data["Heads"], data["Heads_KV"]
+
+    enable_gqa = Heads != Heads_KV
+    expected = torch_sdpa(Q, K, V, causal=causal, enable_gqa=enable_gqa)
+    actual = result["Out"]
+
+    max_diff = float(torch.max(torch.abs(actual - expected)))
+    assert torch.allclose(actual, expected, rtol=1e-2, atol=1e-2), \
+        f"FMHA mismatch! max diff: {max_diff}"
+
+
+#=============================================================================
+# Reference implementations for benchmarking
+#=============================================================================
+
+def run_others(data, *, nruns: int = 1, warmup: int = 0):
+    """Run reference implementations for comparison."""
+    results = {}
+    Q, K, V = data["Q"], data["K"], data["V"]
+    causal = data["causal"]
+    Heads, Heads_KV = data["Heads"], data["Heads_KV"]
+    enable_gqa = Heads != Heads_KV
+
+    # PyTorch SDPA reference (MATH backend; see torch_sdpa)
+    for _ in range(warmup):
+        _ = torch_sdpa(Q, K, V, causal=causal, enable_gqa=enable_gqa)
+    torch.cuda.synchronize()
+
+    times_torch = []
+    for _ in range(nruns):
+        start = torch.cuda.Event(enable_timing=True)
+        end = torch.cuda.Event(enable_timing=True)
+        start.record()
+        _ = torch_sdpa(Q, K, V, causal=causal, enable_gqa=enable_gqa)
+        end.record()
+        torch.cuda.synchronize()
+        times_torch.append(start.elapsed_time(end))
+    results["PyTorch SDPA"] = times_torch
+
+    return results
+
+
+#=============================================================================
+# Main
+#=============================================================================
+
+def test_attention(dtype, D_k, SeqLen_Q, Heads, Batch, D_v, SeqLen_KV, Heads_KV,
+                   causal, tm, tn, name=None):
+    """Test attention with given parameters."""
+    if name is None:
+        dtype_name = str(dtype).split('.')[-1]
+        name = ", ".join([
+            dtype_name,
+            f"tile={tm}x{tn}",
+            f"Q={D_k}x{SeqLen_Q}",
+            f"K={D_k}x{SeqLen_KV}",
+            f"V={D_v}x{SeqLen_KV}",
+            f"Heads={Heads}/{Heads_KV}",
+            f"Batch={Batch}",
+            f"causal={causal}"
+        ])
+    print(f"--- {name} ---")
+    data = prepare(
+        D_k=D_k, SeqLen_Q=SeqLen_Q, Heads=Heads, Batch=Batch,
+        D_v=D_v, SeqLen_KV=SeqLen_KV, Heads_KV=Heads_KV,
+        causal=causal, dtype=dtype
+    )
+    result = run(data, tm=tm, tn=tn)
+    verify(data, result)
+    print(" passed")
+
+
+def main():
+    print("--- cuTile Fused Multi-Head Attention Examples ---\n")
+
+    for dtype in (torch.float32, torch.float16):
+        # basic
+        test_attention(dtype, 64, 256, 8, 2, 64, 256, 8, False, 32, 32)
+        test_attention(dtype, 64, 256, 8, 2, 64, 128, 4, False, 32, 64)
+        test_attention(dtype, 64, 256, 8, 2, 64, 256, 8, True, 32, 32)
+
+        # uneven seqlen
+        test_attention(dtype, 64, 127, 4, 1, 64, 127, 4, False, 32, 32)
+        test_attention(dtype, 64, 128, 4, 1, 64, 97, 2, False, 32, 32)
+
+        # D_k != D_v (mirrors the Julia example: D_v=32, Heads_KV=4)
+        test_attention(dtype, 64, 256, 8, 2, 32, 256, 4, False, 32, 32)
+
+    print("\n--- All attention examples completed ---")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/transpose.py b/examples/transpose.py
index e299ef3..1996a3b 100644
--- a/examples/transpose.py
+++ b/examples/transpose.py
@@ -12,7 +12,7 @@ def 
transpose_cutile_kernel(input, output, tile_m: ct.Constant[int], tile_n: ct. pid_m = ct.bid(0) pid_n = ct.bid(1) tile = ct.load(input, index=(pid_m, pid_n), shape=(tile_m, tile_n)) - tile_t = transpose(tile) + tile_t = ct.transpose(tile) ct.store(output, index=(pid_n, pid_m), tile=tile_t) diff --git a/test/Project.toml b/test/Project.toml index 278b9d8..1310e63 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -5,6 +5,7 @@ DLFP8Types = "f4c16678-4a16-415b-82ef-ed337c5d6c7c" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" FileCheck = "4e644321-382b-4b05-b0b6-5d23c3d944fb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" ParallelTestRunner = "d3525ed8-44d0-4b2c-a655-542cee43accc" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"