From 7dd561b30007dc785befa1cad713505236173ea2 Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Mon, 6 Oct 2025 22:00:26 -0400 Subject: [PATCH 1/2] Use mtlheap --- candle-core/examples/test.rs | 12 ++++ candle-core/src/metal_backend/device.rs | 38 +++++++++- candle-core/src/metal_backend/mod.rs | 1 + candle-core/src/metal_backend/pool.rs | 96 +++++++++++++++++++++++++ candle-core/src/tensor_pool.rs | 26 +++++++ 5 files changed, 170 insertions(+), 3 deletions(-) create mode 100644 candle-core/examples/test.rs create mode 100644 candle-core/src/metal_backend/pool.rs create mode 100644 candle-core/src/tensor_pool.rs diff --git a/candle-core/examples/test.rs b/candle-core/examples/test.rs new file mode 100644 index 0000000000..5d341e0561 --- /dev/null +++ b/candle-core/examples/test.rs @@ -0,0 +1,12 @@ +use anyhow::Result; +use candle_core::{Device, Tensor}; + +fn main() -> Result<()> { + // This requires the code to be run with MTL_CAPTURE_ENABLED=1 + let device = Device::new_metal(0)?; + + let x = Tensor::randn(0f32, 1.0, (128, 128), &device)?; + let x1 = x.add(&x)?; + println!("{x1}"); + Ok(()) +} diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index c5cc96e208..9bd4be36fb 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -1,9 +1,10 @@ +use crate::metal_backend::pool::MetalTensorPool; use crate::{DType, Result}; use candle_metal_kernels::Kernels; use metal::{Buffer, CommandBuffer, CommandQueue, MTLResourceOptions, NSUInteger}; use std::collections::HashMap; use std::path::Path; -use std::sync::{Arc, Mutex, RwLock}; +use std::sync::{Arc, LazyLock, Mutex, RwLock}; use super::MetalError; @@ -12,7 +13,12 @@ use super::MetalError; #[cfg(target_os = "ios")] pub const SHARED_BUFFER_STORAGE_MODE: MTLResourceOptions = MTLResourceOptions::StorageModeShared; #[cfg(not(target_os = "ios"))] -pub const SHARED_BUFFER_STORAGE_MODE: MTLResourceOptions = MTLResourceOptions::StorageModeManaged; +pub const SHARED_BUFFER_STORAGE_MODE: MTLResourceOptions = MTLResourceOptions::StorageModeShared; + +// Pooling should be per-device, not a single global optional pool. Use a global +// map keyed by DeviceId to avoid cross-device contention and accidental sharing. +pub(crate) static POOLS: LazyLock>>> = + LazyLock::new(|| RwLock::new(HashMap::new())); /// Unique identifier for cuda devices. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] @@ -223,6 +229,27 @@ impl MetalDevice { commands.wait_until_completed() } + /// Ensure a tensor pool exists for this device and return it. If one does not exist, + /// it is created with the provided `size_in_bytes` capacity. + /// + /// Note: `MetalTensorPool::new` is assumed to take `(metal::Device, usize)`; adjust the + /// constructor call if your actual signature differs. + pub(crate) fn ensure_pool(&self, size_in_bytes: usize) -> Arc { + // Fast path: try read lock first + if let Ok(g) = POOLS.read() { + if let Some(p) = g.get(&self.id) { + return p.clone(); + } + } + // Slow path: upgrade to write lock and insert if still absent + let mut g = POOLS + .write() + .expect("metal tensor pool map poisoned"); + g.entry(self.id) + .or_insert_with(|| Arc::new(MetalTensorPool::new(self, size_in_bytes).unwrap())) + .clone() + } + pub fn kernels(&self) -> &Kernels { &self.kernels } @@ -314,8 +341,13 @@ impl MetalDevice { &self, size: NSUInteger, option: MTLResourceOptions, - _name: &str, + name: &str, ) -> Result> { + let pool = self.ensure_pool(1024*1024*1024); + if option == MTLResourceOptions::StorageModeShared { + return pool.allocate_buffer(size, name, option); + } + let mut buffers = self.buffers.write().map_err(MetalError::from)?; if let Some(b) = find_available_buffer(size, option, &buffers) { // Cloning also ensures we increment the strong count diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index 47c44a8711..cc523be45c 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex, PoisonError, RwLock, TryLockError}; mod device; pub use device::{DeviceId, MetalDevice, SHARED_BUFFER_STORAGE_MODE}; +mod pool; pub fn buffer_o<'a>(buffer: &'a Buffer, l: &Layout, dtype: DType) -> BufferOffset<'a> { BufferOffset { diff --git a/candle-core/src/metal_backend/pool.rs b/candle-core/src/metal_backend/pool.rs new file mode 100644 index 0000000000..8800c0788f --- /dev/null +++ b/candle-core/src/metal_backend/pool.rs @@ -0,0 +1,96 @@ +use std::sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, +}; + +use crate::{Error, Result}; + +use super::MetalDevice; +use metal::{Buffer, HeapDescriptor, MTLResourceOptions, MTLStorageMode, NSUInteger}; + +#[derive(Debug)] +struct MetalPoolInner { + device: MetalDevice, + heap: metal::Heap, + capacity: u64, + id: usize, +} + +#[derive(Clone, Debug)] +pub struct MetalTensorPool { + inner: Arc, +} + +impl MetalTensorPool { + pub fn new(device: &MetalDevice, size_in_bytes: usize) -> Result { + if size_in_bytes == 0 { + crate::bail!("metal pool size must be greater than zero") + } + let descriptor = HeapDescriptor::new(); + descriptor.set_size(size_in_bytes as NSUInteger); + descriptor.set_storage_mode(MTLStorageMode::Shared); + // descriptor.set_heap_type(MTLHeapType::Placement); + // descriptor.set_resource_options(MTLResourceOptions::StorageModePrivate); + + let heap = device.device.new_heap(&descriptor); + + static NEXT_ID: AtomicUsize = AtomicUsize::new(1); + let id = NEXT_ID.fetch_add(1, Ordering::Relaxed); + + Ok(Self { + inner: Arc::new(MetalPoolInner { + device: device.clone(), + heap, + capacity: size_in_bytes as u64, + id, + }), + }) + } + + pub fn id(&self) -> usize { + self.inner.id + } + + pub fn device(&self) -> &MetalDevice { + &self.inner.device + } + + pub fn capacity(&self) -> u64 { + self.inner.capacity + } + + pub fn allocate_buffer( + &self, + size: NSUInteger, + label: &str, + options: MTLResourceOptions, + ) -> Result> { + if size > self.inner.capacity { + crate::bail!( + "pool allocation of {size} bytes exceeds pool capacity {}", + self.inner.capacity + ) + } + let size_align = self + .inner + .device + .device + .heap_buffer_size_and_align(size, options); + let align = std::cmp::max(size_align.align, 1); + let available = self.inner.heap.max_available_size_with_alignment(align); + if size_align.size > available { + crate::bail!( + "pool allocation of {size} bytes exceeds remaining capacity {}", + available + ) + } + let buffer = self + .inner + .heap + .new_buffer(size, options) + .ok_or_else(|| Error::msg("metal heap allocation returned null"))?; + buffer.set_label(label); + println!("allocating {size} with {options:?}"); + Ok(Arc::new(buffer)) + } +} diff --git a/candle-core/src/tensor_pool.rs b/candle-core/src/tensor_pool.rs new file mode 100644 index 0000000000..6573c945c8 --- /dev/null +++ b/candle-core/src/tensor_pool.rs @@ -0,0 +1,26 @@ +#[derive(Clone, Debug)] +pub enum TensorPool { + #[cfg(feature = "metal")] + Metal(crate::metal_backend::MetalTensorPool), +} + +impl TensorPool { + #[cfg(feature = "metal")] + pub fn from_metal(pool: crate::metal_backend::MetalTensorPool) -> Self { + Self::Metal(pool) + } + + #[cfg(feature = "metal")] + pub fn as_metal(&self) -> Option<&crate::metal_backend::MetalTensorPool> { + match self { + Self::Metal(pool) => Some(pool), + } + } + + #[cfg(feature = "metal")] + pub fn into_metal(self) -> crate::metal_backend::MetalTensorPool { + match self { + Self::Metal(pool) => pool, + } + } +} From 1ec8cee09b404faa25da2a2ac0cec90a11785aa9 Mon Sep 17 00:00:00 2001 From: Eric Buehler Date: Tue, 7 Oct 2025 08:17:42 -0400 Subject: [PATCH 2/2] Tmp --- candle-core/src/metal_backend/device.rs | 14 ++++++++------ candle-core/src/metal_backend/pool.rs | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index 9bd4be36fb..1f226372ba 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -270,7 +270,7 @@ impl MetalDevice { name: &str, ) -> Result> { let size = (element_count * dtype.size_in_bytes()) as NSUInteger; - self.allocate_buffer(size, MTLResourceOptions::StorageModePrivate, name) + self.allocate_buffer(size, MTLResourceOptions::StorageModeShared, name) } pub fn new_buffer_private( @@ -280,7 +280,7 @@ impl MetalDevice { name: &str, ) -> Result> { let size = (element_count * dtype.size_in_bytes()) as NSUInteger; - self.allocate_buffer(size, metal::MTLResourceOptions::StorageModePrivate, name) + self.allocate_buffer(size, metal::MTLResourceOptions::StorageModeShared, name) } /// Creates a new buffer (not necessarily zeroed). @@ -318,7 +318,7 @@ impl MetalDevice { pub fn allocate_zeros(&self, size_in_bytes: usize) -> Result> { let buffer = self.allocate_buffer( size_in_bytes as NSUInteger, - MTLResourceOptions::StorageModePrivate, + MTLResourceOptions::StorageModeShared, "allocate_zeros", )?; let command_buffer = self.command_buffer()?; @@ -343,9 +343,11 @@ impl MetalDevice { option: MTLResourceOptions, name: &str, ) -> Result> { - let pool = self.ensure_pool(1024*1024*1024); + // println!("{option:?}"); + let pool = self.ensure_pool(8*1024*1024*1024); if option == MTLResourceOptions::StorageModeShared { - return pool.allocate_buffer(size, name, option); + // println!("{name}"); + return pool.allocate_buffer(size, name, MTLResourceOptions::StorageModeShared); } let mut buffers = self.buffers.write().map_err(MetalError::from)?; @@ -357,7 +359,7 @@ impl MetalDevice { let size = buf_size(size); let subbuffers = buffers.entry((size, option)).or_insert(vec![]); - let new_buffer = self.device.new_buffer(size as NSUInteger, option); + let new_buffer = self.device.new_buffer(size as NSUInteger, MTLResourceOptions::StorageModeShared); let new_buffer = Arc::new(new_buffer); subbuffers.push(new_buffer.clone()); diff --git a/candle-core/src/metal_backend/pool.rs b/candle-core/src/metal_backend/pool.rs index 8800c0788f..2dd01a0e15 100644 --- a/candle-core/src/metal_backend/pool.rs +++ b/candle-core/src/metal_backend/pool.rs @@ -29,6 +29,7 @@ impl MetalTensorPool { let descriptor = HeapDescriptor::new(); descriptor.set_size(size_in_bytes as NSUInteger); descriptor.set_storage_mode(MTLStorageMode::Shared); + dbg!(descriptor.hazard_tracking_mode()); // descriptor.set_heap_type(MTLHeapType::Placement); // descriptor.set_resource_options(MTLResourceOptions::StorageModePrivate); @@ -71,6 +72,7 @@ impl MetalTensorPool { self.inner.capacity ) } + let options = MTLResourceOptions::StorageModeShared | MTLResourceOptions::HazardTrackingModeTracked; let size_align = self .inner .device @@ -90,7 +92,7 @@ impl MetalTensorPool { .new_buffer(size, options) .ok_or_else(|| Error::msg("metal heap allocation returned null"))?; buffer.set_label(label); - println!("allocating {size} with {options:?}"); + println!("allocating {size}/{label} with {options:?}"); Ok(Arc::new(buffer)) } }