From b89b700d49b58e78b3c68daab23bd34507567d54 Mon Sep 17 00:00:00 2001 From: Paul Szerlip Date: Mon, 28 Jul 2025 10:36:17 -0400 Subject: [PATCH 1/7] Due to crashes during either new_metal device calls or resource allocation (see [issue on Candle main](https://github.com/huggingface/candle/issues/2322)) 1. Replaced references to MTLResourceOptions::StorageMode* enums to METAL_SHARED_BUFFER_STORAGE_MODE Tested working on iOS 18.2 iphone14 pro and iphone 16 pro. --- candle-core/src/metal_backend/mod.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index a77c37168b..0eb3167b38 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -13,6 +13,13 @@ use std::sync::{Arc, Mutex, PoisonError, RwLock, TryLockError}; mod device; pub use device::{DeviceId, MetalDevice}; +// iOS and macOS have different storage modes for shared buffers. +// due to the GPU/CPU management differences. +#[cfg(target_os = "ios")] +const SHARED_BUFFER_STORAGE_MODE: MTLResourceOptions = MTLResourceOptions::StorageModeShared; +#[cfg(not(target_os = "ios"))] +const SHARED_BUFFER_STORAGE_MODE: MTLResourceOptions = MTLResourceOptions::StorageModeManaged; + pub fn buffer_o<'a>(buffer: &'a Buffer, l: &Layout, dtype: DType) -> BufferOffset<'a> { BufferOffset { buffer, @@ -2064,7 +2071,7 @@ impl BackendDevice for MetalDevice { let seed = Arc::new(Mutex::new(device.new_buffer_with_data( [299792458].as_ptr() as *const c_void, 4, - MTLResourceOptions::StorageModeManaged, + SHARED_BUFFER_STORAGE_MODE, ))); let commands = device::Commands::new(command_queue)?; Ok(Self { From 2d1906330b1c11d2a03c5440ea50a3130607ec4c Mon Sep 17 00:00:00 2001 From: Paul Szerlip Date: Mon, 28 Jul 2025 10:54:15 -0400 Subject: [PATCH 2/7] feat(ios-MTLResourceOptions-managed-everywhere-it-is-called) --- candle-core/src/metal_backend/device.rs | 13 ++++++++++--- candle-core/src/metal_backend/mod.rs | 9 +-------- .../examples/metal_benchmarks.rs | 5 ++++- candle-metal-kernels/src/tests.rs | 17 ++++++++++------- candle-metal-kernels/tmp/affine.rs | 3 +++ candle-metal-kernels/tmp/binary.rs | 3 +++ candle-metal-kernels/tmp/cast.rs | 3 +++ candle-metal-kernels/tmp/unary.rs | 3 +++ 8 files changed, 37 insertions(+), 19 deletions(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index f249202ddd..c5cc96e208 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -7,6 +7,13 @@ use std::sync::{Arc, Mutex, RwLock}; use super::MetalError; +// iOS and macOS have different storage modes for shared buffers. +// due to the GPU/CPU management differences. +#[cfg(target_os = "ios")] +pub const SHARED_BUFFER_STORAGE_MODE: MTLResourceOptions = MTLResourceOptions::StorageModeShared; +#[cfg(not(target_os = "ios"))] +pub const SHARED_BUFFER_STORAGE_MODE: MTLResourceOptions = MTLResourceOptions::StorageModeManaged; + /// Unique identifier for cuda devices. #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] pub struct DeviceId(usize); @@ -255,7 +262,7 @@ impl MetalDevice { /// synchronization when the CPU memory is modified /// Used as a bridge to gather data back from the GPU pub fn new_buffer_managed(&self, size: NSUInteger) -> Result> { - self.allocate_buffer(size, MTLResourceOptions::StorageModeManaged, "managed") + self.allocate_buffer(size, SHARED_BUFFER_STORAGE_MODE, "managed") } /// Creates a new buffer from data. @@ -268,12 +275,12 @@ impl MetalDevice { let new_buffer = self.device.new_buffer_with_data( data.as_ptr().cast(), size, - MTLResourceOptions::StorageModeManaged, + SHARED_BUFFER_STORAGE_MODE, ); let mut buffers = self.buffers.write().map_err(MetalError::from)?; let subbuffers = buffers - .entry((size, MTLResourceOptions::StorageModeManaged)) + .entry((size, SHARED_BUFFER_STORAGE_MODE)) .or_insert(vec![]); let new_buffer = Arc::new(new_buffer); diff --git a/candle-core/src/metal_backend/mod.rs b/candle-core/src/metal_backend/mod.rs index 0eb3167b38..6ae14db888 100644 --- a/candle-core/src/metal_backend/mod.rs +++ b/candle-core/src/metal_backend/mod.rs @@ -11,14 +11,7 @@ use std::ffi::c_void; use std::sync::{Arc, Mutex, PoisonError, RwLock, TryLockError}; mod device; -pub use device::{DeviceId, MetalDevice}; - -// iOS and macOS have different storage modes for shared buffers. -// due to the GPU/CPU management differences. -#[cfg(target_os = "ios")] -const SHARED_BUFFER_STORAGE_MODE: MTLResourceOptions = MTLResourceOptions::StorageModeShared; -#[cfg(not(target_os = "ios"))] -const SHARED_BUFFER_STORAGE_MODE: MTLResourceOptions = MTLResourceOptions::StorageModeManaged; +pub use device::{DeviceId, MetalDevice, SHARED_BUFFER_STORAGE_MODE}; pub fn buffer_o<'a>(buffer: &'a Buffer, l: &Layout, dtype: DType) -> BufferOffset<'a> { BufferOffset { diff --git a/candle-metal-kernels/examples/metal_benchmarks.rs b/candle-metal-kernels/examples/metal_benchmarks.rs index f0de21e0c2..cb5614889a 100644 --- a/candle-metal-kernels/examples/metal_benchmarks.rs +++ b/candle-metal-kernels/examples/metal_benchmarks.rs @@ -13,7 +13,10 @@ fn run_gemm(f32: bool, n: usize) -> Result<()> { let (b, m, n, k) = (1, n, n, n); let kernels = candle_metal_kernels::Kernels::new(); let command_queue = device.new_command_queue(); - let options = metal::MTLResourceOptions::StorageModeManaged; + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] + let options = MTLResourceOptions::StorageModeManaged; let (lhs, rhs) = if f32 { let lhs: Vec = (0..b * m * k).map(|f| f as f32).collect(); diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index 5934cffb32..83c15e110b 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -13,6 +13,9 @@ fn read_to_vec(buffer: &Buffer, n: usize) -> Vec { } fn new_buffer(device: &Device, data: &[T]) -> Buffer { + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] let options = MTLResourceOptions::StorageModeManaged; let ptr = data.as_ptr() as *const c_void; let size = std::mem::size_of_val(data) as u64; @@ -69,7 +72,7 @@ fn run_binary(x: &[T], y: &[T], name: binary::contiguous::Kernel) -> V let kernels = Kernels::new(); let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let options = MTLResourceOptions::StorageModeManaged; + let options = SHARED_BUFFER_STORAGE_MODE; let left = new_buffer(&device, x); let right = new_buffer(&device, y); let output = device.new_buffer(std::mem::size_of_val(x) as u64, options); @@ -311,7 +314,7 @@ fn run_cast(v: &[T], name: &'static str) -> Vec { let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); let input = new_buffer(&device, v); - let options = MTLResourceOptions::StorageModeManaged; + let options = SHARED_BUFFER_STORAGE_MODE; let size = (v.len() * std::mem::size_of::()) as u64; let output = device.new_buffer(size, options); @@ -874,7 +877,7 @@ fn run_reduce( let command_buffer = command_queue.new_command_buffer(); let input = new_buffer(&device, v); - let options = MTLResourceOptions::StorageModeManaged; + let options = SHARED_BUFFER_STORAGE_MODE; let output = device.new_buffer((out_length * core::mem::size_of::()) as u64, options); let shape = vec![in_length]; match call_reduce_contiguous( @@ -1188,7 +1191,7 @@ fn run_where_cond( let kernels = Kernels::new(); let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let options = MTLResourceOptions::StorageModeManaged; + let options = SHARED_BUFFER_STORAGE_MODE; let length = cond.len(); let cond = device.new_buffer_with_data( @@ -1299,7 +1302,7 @@ fn run_mlx_gemm( let kernels = Kernels::new(); let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let options = MTLResourceOptions::StorageModeManaged; + let options = SHARED_BUFFER_STORAGE_MODE; let lhs = device.new_buffer_with_data( lhs.as_ptr() as *const core::ffi::c_void, @@ -1444,7 +1447,7 @@ fn run_random(name: &'static str, seed: u32, length: usize, a: f32, b: let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let options = MTLResourceOptions::StorageModeManaged; + let options = SHARED_BUFFER_STORAGE_MODE; let output = device.new_buffer((length * core::mem::size_of::()) as NSUInteger, options); let seed = device.new_buffer_with_data( @@ -1570,7 +1573,7 @@ fn run_scatter_add( let kernels = Kernels::new(); let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let options = MTLResourceOptions::StorageModeManaged; + let options = SHARED_BUFFER_STORAGE_MODE; let input_buffer = new_buffer(&device, input); let ids_buffer = new_buffer(&device, ids); let output = device.new_buffer(std::mem::size_of_val(input) as u64, options); diff --git a/candle-metal-kernels/tmp/affine.rs b/candle-metal-kernels/tmp/affine.rs index cd019056c7..a6b2777d0a 100644 --- a/candle-metal-kernels/tmp/affine.rs +++ b/candle-metal-kernels/tmp/affine.rs @@ -30,6 +30,9 @@ fn main() { fn run_affine_bench(device: &Device, kernels: &Kernels, v: &[T]) { let command_queue = device.new_command_queue(); + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] let options = MTLResourceOptions::StorageModeManaged; let iterations = 10000; diff --git a/candle-metal-kernels/tmp/binary.rs b/candle-metal-kernels/tmp/binary.rs index af5a8bdc62..e5b7e11c54 100644 --- a/candle-metal-kernels/tmp/binary.rs +++ b/candle-metal-kernels/tmp/binary.rs @@ -94,6 +94,9 @@ fn run_binary_bench( strided: [binary::strided::Kernel; 4], ) { let command_queue = device.new_command_queue(); + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] let options = MTLResourceOptions::StorageModeManaged; let iterations = 1000; diff --git a/candle-metal-kernels/tmp/cast.rs b/candle-metal-kernels/tmp/cast.rs index 090f510d16..73c0dea3a6 100644 --- a/candle-metal-kernels/tmp/cast.rs +++ b/candle-metal-kernels/tmp/cast.rs @@ -37,6 +37,9 @@ fn run_cast_bench( contiguous: &[&'static str], ) { let command_queue = device.new_command_queue(); + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] let options = MTLResourceOptions::StorageModeManaged; let iterations = 1000; diff --git a/candle-metal-kernels/tmp/unary.rs b/candle-metal-kernels/tmp/unary.rs index 66cf25c0c8..64f597b55a 100644 --- a/candle-metal-kernels/tmp/unary.rs +++ b/candle-metal-kernels/tmp/unary.rs @@ -112,6 +112,9 @@ fn run_unary_bench( strided: [unary::strided::Kernel; 7], ) { let command_queue = device.new_command_queue(); + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] let options = MTLResourceOptions::StorageModeManaged; let iterations = 10000; From fd809385a5e0926bdff883a67342bbfc8a099044 Mon Sep 17 00:00:00 2001 From: Paul Szerlip Date: Tue, 29 Jul 2025 13:03:57 -0400 Subject: [PATCH 3/7] feat(correct-the-benchmark-execution) --- .../examples/metal_benchmarks.rs | 4 +-- candle-metal-kernels/src/tests.rs | 35 +++++++++++++++---- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/candle-metal-kernels/examples/metal_benchmarks.rs b/candle-metal-kernels/examples/metal_benchmarks.rs index cb5614889a..864a239f71 100644 --- a/candle-metal-kernels/examples/metal_benchmarks.rs +++ b/candle-metal-kernels/examples/metal_benchmarks.rs @@ -14,9 +14,9 @@ fn run_gemm(f32: bool, n: usize) -> Result<()> { let kernels = candle_metal_kernels::Kernels::new(); let command_queue = device.new_command_queue(); #[cfg(target_os = "ios")] - let options = MTLResourceOptions::StorageModeShared; + let options = metal::MTLResourceOptions::StorageModeShared; #[cfg(not(target_os = "ios"))] - let options = MTLResourceOptions::StorageModeManaged; + let options = metal::MTLResourceOptions::StorageModeManaged; let (lhs, rhs) = if f32 { let lhs: Vec = (0..b * m * k).map(|f| f as f32).collect(); diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index 83c15e110b..abea22b0c1 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -72,7 +72,10 @@ fn run_binary(x: &[T], y: &[T], name: binary::contiguous::Kernel) -> V let kernels = Kernels::new(); let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let options = SHARED_BUFFER_STORAGE_MODE; + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] + let options = MTLResourceOptions::StorageModeManaged; let left = new_buffer(&device, x); let right = new_buffer(&device, y); let output = device.new_buffer(std::mem::size_of_val(x) as u64, options); @@ -314,7 +317,10 @@ fn run_cast(v: &[T], name: &'static str) -> Vec { let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); let input = new_buffer(&device, v); - let options = SHARED_BUFFER_STORAGE_MODE; + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] + let options = MTLResourceOptions::StorageModeManaged; let size = (v.len() * std::mem::size_of::()) as u64; let output = device.new_buffer(size, options); @@ -877,7 +883,10 @@ fn run_reduce( let command_buffer = command_queue.new_command_buffer(); let input = new_buffer(&device, v); - let options = SHARED_BUFFER_STORAGE_MODE; + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] + let options = MTLResourceOptions::StorageModeManaged; let output = device.new_buffer((out_length * core::mem::size_of::()) as u64, options); let shape = vec![in_length]; match call_reduce_contiguous( @@ -1191,7 +1200,10 @@ fn run_where_cond( let kernels = Kernels::new(); let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let options = SHARED_BUFFER_STORAGE_MODE; + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] + let options = MTLResourceOptions::StorageModeManaged; let length = cond.len(); let cond = device.new_buffer_with_data( @@ -1302,7 +1314,10 @@ fn run_mlx_gemm( let kernels = Kernels::new(); let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let options = SHARED_BUFFER_STORAGE_MODE; + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] + let options = MTLResourceOptions::StorageModeManaged; let lhs = device.new_buffer_with_data( lhs.as_ptr() as *const core::ffi::c_void, @@ -1447,7 +1462,10 @@ fn run_random(name: &'static str, seed: u32, length: usize, a: f32, b: let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let options = SHARED_BUFFER_STORAGE_MODE; + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] + let options = MTLResourceOptions::StorageModeManaged; let output = device.new_buffer((length * core::mem::size_of::()) as NSUInteger, options); let seed = device.new_buffer_with_data( @@ -1573,7 +1591,10 @@ fn run_scatter_add( let kernels = Kernels::new(); let command_queue = device.new_command_queue(); let command_buffer = command_queue.new_command_buffer(); - let options = SHARED_BUFFER_STORAGE_MODE; + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] + let options = MTLResourceOptions::StorageModeManaged; let input_buffer = new_buffer(&device, input); let ids_buffer = new_buffer(&device, ids); let output = device.new_buffer(std::mem::size_of_val(input) as u64, options); From 88a35789ae05e0d2e8ff212a3146e65b99296a61 Mon Sep 17 00:00:00 2001 From: Paul Szerlip Date: Tue, 29 Jul 2025 14:44:37 -0400 Subject: [PATCH 4/7] feat(fixed-buffer-private-creation-on-ios-no-cpu-access-and-i64-input-issues-stricter-on-ios) --- candle-metal-kernels/src/sort.rs | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/candle-metal-kernels/src/sort.rs b/candle-metal-kernels/src/sort.rs index e4140eb38b..1aee8fbd7f 100644 --- a/candle-metal-kernels/src/sort.rs +++ b/candle-metal-kernels/src/sort.rs @@ -92,11 +92,11 @@ pub fn multi_block_sort( &src, &mut dev_vals_0, &mut dev_idxs_0, - /* size_sorted_axis */ ncols as i32, - /* stride_sorted_axis */ 1i32, - /* nc_dim */ 1i32, - /* nc_shape */ nrows as i32, - /* nc_str */ ncols as i32 + /* size_sorted_axis */ ncols as i64, + /* stride_sorted_axis */ 1i64, + /* nc_dim */ 1i64, + /* nc_shape */ nrows as i64, + /* nc_str */ ncols as i64 ) ); let thread_group_count = MTLSize { @@ -243,11 +243,11 @@ pub fn block_sort( ( &src, dst, - ncols as i32, - 1i32, - 1i32, - ncols as i32, - ncols as i32 + ncols as i64, + 1i64, + 1i64, + ncols as i64, + ncols as i64 ) ); let thread_group_count = MTLSize { From a452089fca78c5abdba17c87f76e248ec6735e3c Mon Sep 17 00:00:00 2001 From: Paul Szerlip Date: Tue, 29 Jul 2025 16:09:42 -0400 Subject: [PATCH 5/7] feat(power-of-two-incorrect-calculations-for-size-2-and-pot-plus-1) --- candle-core/src/metal_backend/device.rs | 4 ++- candle-metal-kernels/src/tests.rs | 39 +++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index c5cc96e208..24068147b7 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -323,6 +323,7 @@ impl MetalDevice { } let size = buf_size(size); + println!("Allocating new buffer of size {size} with option {option:?}"); let subbuffers = buffers.entry((size, option)).or_insert(vec![]); let new_buffer = self.device.new_buffer(size as NSUInteger, option); @@ -354,7 +355,8 @@ impl MetalDevice { } fn buf_size(size: NSUInteger) -> NSUInteger { - size.saturating_sub(1).next_power_of_two() as NSUInteger + // size.saturating_sub(1).next_power_of_two() as NSUInteger + size.next_power_of_two() as NSUInteger } fn find_available_buffer( diff --git a/candle-metal-kernels/src/tests.rs b/candle-metal-kernels/src/tests.rs index abea22b0c1..1bf210fed9 100644 --- a/candle-metal-kernels/src/tests.rs +++ b/candle-metal-kernels/src/tests.rs @@ -525,6 +525,45 @@ fn cast_i64() { assert_eq!(results, v_u8); } +// This test specifically targets the buffer size mismatch for scalar casting. +#[test] +fn cast_scalar() { + let device = device(); + let kernels = Kernels::new(); + let command_queue = device.new_command_queue(); + let command_buffer = command_queue.new_command_buffer(); + + let input_data = &[1.0f32]; + let input_buffer = new_buffer(&device, input_data); + let input = BufferOffset::zero_offset(&input_buffer); + + #[cfg(target_os = "ios")] + let options = MTLResourceOptions::StorageModeShared; + #[cfg(not(target_os = "ios"))] + let options = MTLResourceOptions::StorageModeManaged; + + // This is the BUG: The output buffer is allocated with the size of the + // INPUT dtype (f32 = 4 bytes) instead of the OUTPUT dtype (bf16 = 2 bytes). + // The error message shows length=1, but it should be 2. Let's replicate + // the likely buggy allocation size calculation to trigger the validation error. + let buggy_size = (1 * std::mem::size_of::()) as u64; // Incorrectly using f32 size + let output_buffer = device.new_buffer(buggy_size, options); + + // This call should fail the Metal validation. + call_cast_contiguous( + &device, + command_buffer, + &kernels, + "cast_f32_bf16", + 1, // el_count = 1 + input, + &output_buffer, + ) + .unwrap(); + command_buffer.commit(); + command_buffer.wait_until_completed(); +} + fn run_affine(v: &[T], mul: f64, add: f64) -> Vec { let device = device(); let kernels = Kernels::new(); From bb6fdb6fb6195b84e3c773f499a93e051d007b98 Mon Sep 17 00:00:00 2001 From: Paul Szerlip Date: Tue, 29 Jul 2025 16:14:42 -0400 Subject: [PATCH 6/7] feat(no-printing-tested-scalar-working) --- candle-core/src/metal_backend/device.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/candle-core/src/metal_backend/device.rs b/candle-core/src/metal_backend/device.rs index 24068147b7..752a9f86f7 100644 --- a/candle-core/src/metal_backend/device.rs +++ b/candle-core/src/metal_backend/device.rs @@ -323,7 +323,6 @@ impl MetalDevice { } let size = buf_size(size); - println!("Allocating new buffer of size {size} with option {option:?}"); let subbuffers = buffers.entry((size, option)).or_insert(vec![]); let new_buffer = self.device.new_buffer(size as NSUInteger, option); From 2347ece3c769e4f093b84f439f077829313a1e78 Mon Sep 17 00:00:00 2001 From: Paul Szerlip Date: Wed, 30 Jul 2025 08:49:35 -0400 Subject: [PATCH 7/7] feat(scalar-empty-buffer-causes-failure-on-ios) --- candle-metal-kernels/src/lib.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/candle-metal-kernels/src/lib.rs b/candle-metal-kernels/src/lib.rs index c03b1c1370..5c14dd62ad 100644 --- a/candle-metal-kernels/src/lib.rs +++ b/candle-metal-kernels/src/lib.rs @@ -625,6 +625,11 @@ pub fn call_binary_strided( let (thread_group_count, thread_group_size) = linear_split(&pipeline, width); encoder.set_compute_pipeline_state(&pipeline); + + let dummy = &[0usize]; + let shape = if num_dims == 0 { dummy } else { shape }; + let left_strides = if num_dims == 0 { dummy } else { left_strides }; + let right_strides = if num_dims == 0 { dummy } else { right_strides }; set_params!( encoder, (