From 7f39e91564a30777cc1e723a1557720e08643b51 Mon Sep 17 00:00:00 2001 From: spencer-lunarg Date: Fri, 31 Oct 2025 14:51:37 -0400 Subject: [PATCH] Make BDA Alignment a dedicated chapter --- README.adoc | 2 + antora/modules/ROOT/nav.adoc | 1 + chapters/buffer_device_address.adoc | 93 +-------- chapters/buffer_device_address_alignment.adoc | 192 ++++++++++++++++++ .../buffer_device_address_alignment_1.svg | 4 + 5 files changed, 200 insertions(+), 92 deletions(-) create mode 100644 chapters/buffer_device_address_alignment.adoc create mode 100644 chapters/images/buffer_device_address_alignment_1.svg diff --git a/README.adoc b/README.adoc index 45742be..522d2c4 100644 --- a/README.adoc +++ b/README.adoc @@ -110,6 +110,8 @@ The Vulkan Guide content is also viewable from https://docs.vulkan.org/guide/lat * `VK_KHR_buffer_device_address`, `VK_EXT_buffer_device_address` +==== xref:{chapters}buffer_device_address_alignment.adoc[Buffer Device Address - Alignment] + == xref:{chapters}pipeline_cache.adoc[Pipeline Caching/Derivatives] == xref:{chapters}threading.adoc[Threading] diff --git a/antora/modules/ROOT/nav.adoc b/antora/modules/ROOT/nav.adoc index 9e24b96..36e6b98 100644 --- a/antora/modules/ROOT/nav.adoc +++ b/antora/modules/ROOT/nav.adoc @@ -41,6 +41,7 @@ *** xref:{chapters}sparse_resources.adoc[] *** xref:{chapters}protected.adoc[] *** xref:{chapters}buffer_device_address.adoc[] +**** xref:{chapters}buffer_device_address_alignment.adoc[] ** xref:{chapters}pipeline_cache.adoc[] ** xref:{chapters}threading.adoc[] ** xref:{chapters}depth.adoc[] diff --git a/chapters/buffer_device_address.adoc b/chapters/buffer_device_address.adoc index e80307b..8470d63 100644 --- a/chapters/buffer_device_address.adoc +++ b/chapters/buffer_device_address.adoc @@ -71,98 +71,7 @@ Some device migth support `bufferDeviceAddress`, but not `shaderInt64`. The way === Alignment -All variables accessed with `PhysicalStorageBuffer` must have an `Aligned` memory operand to it. 
- -[source,swift] ----- -%x = OpLoad %type %ptr Aligned 16 -OpStore %ptr %obj Aligned 16 ----- - -Shading languages will have a default, but can allow you to align it explicitly (ex `buffer_reference_alignment`). - -The goal of this alignment is this is a promise for how aligned this specific pointer is. -The compiler has no idea what the address will be when the shader is compiled. -By providing an alignment it can generate valid code to match the requirement. -The user is responsible to confirm the address they use is aligned to it. - -[source,glsl] ----- -layout(buffer_reference, buffer_reference_align = 64) buffer MyBDA { - uint data; -}; - -MyBDA ptr_a; // at 0x1000 -MyBDA ptr_b; // at 0x1010 -MyBDA ptr_c; // at 0x1040 - -ptr_a.data = 0; // (Aligned 64) valid! -ptr_b.data = 0; // (Aligned 64) invalid! -ptr_c.data = 0; // (Aligned 64) valid! ----- - -When deciding on an alignment, the minimum value will always be the size greater than or equal to the largest scalar/component type in the block. - -[source,glsl] ----- -// alignment must be at least 4 -layout(buffer_reference) buffer MyBDA { - vec4 a; // scalar is float -}; - -// alignment must be at least 1 -layout(buffer_reference) buffer MyBDA { - uint8_t a; // scalar is 8-bit int -}; - -// alignment must be at least 8 -layout(buffer_reference) buffer MyBDA { - uint a; // 32-bit - double b; // 64-bit -}; ----- - -=== Alignment Example - -To help explain alignment, lets take an example of loading an array of vectors - -[source,glsl] ----- -layout(buffer_reference, buffer_reference_align = ???) buffer MyBDA { - uvec4 data[]; -}; - -MyBDA ptr; // at 0x1000 -ptr.data[i] = uvec4(0); ----- - -Here we have 2 options, we could set the `Aligned` to be `4` or `16`. - -If we set alignment to `16` we are letting the compiler know it can load 16 bytes at a time, so it will hopefully do a vector load/store on the memory. 
- If we set alignment to `4` the compiler will likely have no way to infer the real alignment and will now do 4 scalar int load/store on the memory. -[NOTE] -==== -Some GPUs can do vector load/store even on unaligned addresses. -==== - -For the next case, if we had `uvec3` instead of `uvec4` such as - -[source,glsl] ----- -layout(buffer_reference, buffer_reference_align = 4, scalar) buffer MyBDA { - uvec3 data[]; -}; - -data[0]; // 0x1000 -data[1]; // 0x100C -data[2]; // 0x1018 -data[3]; // 0x1024 ----- - -We know that setting the alignment to `16` would be violated at `data[1]` and therefore we need to use an alignment of `4` in this case. -Luckily shading languages will help do this for you as seen in both link:https://godbolt.org/z/jWGKax1ed[glslang] and link:https://godbolt.org/z/Y7xW3Mfd4[slang] . +See dedicated xref:{chapters}buffer_device_address_alignment.adoc#buffer-device-address-alignment[BDA Alignment chapter]. === Nullptr diff --git a/chapters/buffer_device_address_alignment.adoc b/chapters/buffer_device_address_alignment.adoc new file mode 100644 index 0000000..95494af --- /dev/null +++ b/chapters/buffer_device_address_alignment.adoc @@ -0,0 +1,192 @@ +// Copyright 2024 The Khronos Group, Inc. +// SPDX-License-Identifier: CC-BY-4.0 + +// Required for both single-page and combined guide xrefs to work +ifndef::chapters[:chapters:] +ifndef::images[:images: images/] + +[[buffer-device-address-alignment]] += Buffer Device Address Alignment + +All variables accessed with `PhysicalStorageBuffer` must have an `Aligned` memory operand to it. + +[source,swift] +---- +%x = OpLoad %type %ptr Aligned 16 +OpStore %ptr %obj Aligned 16 +---- + +Shading languages will have a default, but can allow you to align it explicitly (ex `buffer_reference_align`). + +This alignment is a promise of how aligned this specific pointer is. +The compiler has no idea what the address will be when the shader is compiled. 
+By providing an alignment it can generate valid code to match the requirement. +The user is responsible for confirming that the address they use is aligned to it. + +[source,glsl] +---- +layout(buffer_reference, buffer_reference_align = 64) buffer MyBDA { + uint data; +}; + +MyBDA ptr_a; // at 0x1000 +MyBDA ptr_b; // at 0x1010 +MyBDA ptr_c; // at 0x1040 + +ptr_a.data = 0; // (Aligned 64) valid! +ptr_b.data = 0; // (Aligned 64) invalid! +ptr_c.data = 0; // (Aligned 64) valid! +---- + +When deciding on an alignment, the value must always be at least the size of the largest scalar/component type in the block. + +[source,glsl] +---- +// alignment must be at least 4 +layout(buffer_reference) buffer MyBDA { + vec4 a; // scalar is float +}; + +// alignment must be at least 1 +layout(buffer_reference) buffer MyBDA { + uint8_t a; // scalar is 8-bit int +}; + +// alignment must be at least 8 +layout(buffer_reference) buffer MyBDA { + uint a; // 32-bit + double b; // 64-bit +}; +---- + +== Setting Alignment Example + +To help explain alignment, let's take an example of loading an array of vectors: + +[source,glsl] +---- +layout(buffer_reference, buffer_reference_align = ???) buffer MyBDA { + uvec4 data[]; +}; + +MyBDA ptr; // at 0x1000 +ptr.data[i] = uvec4(0); +---- + +Here we have 2 options: we could set the `Aligned` to be `4` or `16`. + +If we set alignment to `16` we are letting the compiler know it can load 16 bytes at a time, so it will hopefully do a vector load/store on the memory. + +If we set alignment to `4` the compiler will likely have no way to infer the real alignment and will now do 4 scalar int load/store on the memory. + +[NOTE] +==== +Some GPUs can do vector load/store even on unaligned addresses. 
+==== + +For the next case, if we had `uvec3` instead of `uvec4` such as + +[source,glsl] +---- +layout(buffer_reference, buffer_reference_align = 4, scalar) buffer MyBDA { + uvec3 data[]; +}; + +data[0]; // 0x1000 +data[1]; // 0x100C +data[2]; // 0x1018 +data[3]; // 0x1024 +---- + +We know that setting the alignment to `16` would be violated at `data[1]` and therefore we need to use an alignment of `4` in this case. +Luckily shading languages will help do this for you as seen in both link:https://godbolt.org/z/jWGKax1ed[glslang] and link:https://godbolt.org/z/Y7xW3Mfd4[slang]. + +== Matching Alignment From The Host + +When dealing with buffer device address, you are able to do a simple `memcpy` to that memory on the host, which can easily lead to bugs if you aren't careful about things being aligned. + +[NOTE] +==== +The following issues are not directly tied to Buffer Device Address, and still can occur with any uniform or storage buffer. +==== + +Take the following GLSL code as an example (link:https://godbolt.org/z/G4P8GdG9q[view online]) + +[source,glsl] +---- +// ArrayStride is 16 +struct Metadata { + uint64_t address; + uint status; +}; + +layout(buffer_reference, buffer_reference_align = 8, scalar) readonly buffer Payload { + uint count; // offset 0 + Metadata meta[]; // offset 8 +}; + +layout(set = 0, binding = 0) buffer SSBO_0 { + Payload data; +}; +---- + +Because the `uint64_t` needs to be accessed at an 8-byte alignment, `glslang` (and any other compiler) will be smart and pack things as tightly as possible for you while keeping that alignment. + +The first thing you might notice is that `Metadata` needs to have an array stride of 16 instead of 12. This is because otherwise `uint64_t address` would land on a non-8-byte-aligned address in every other element of the array. + +The next thing to notice is that, because the **largest scalar** in `struct Metadata` is an 8-byte value, the compiler knows to place `meta` at offset `8` instead of `4`. 
This is why trying to change the struct to + +[source,glsl] +---- +struct Metadata { + uint status; + uint64_t address; +}; +---- + +or + +[source,glsl] +---- +struct Metadata { + uint64_t address; + uint status; + uint pad; +}; +---- + +won't change the offset from `8`. + +Here is how the data is laid out in memory: + +image::{images}buffer_device_address_alignment_1.svg[buffer_device_address_alignment_1.svg] + +The issue arises when we try to write this data from the host. When you call `vkMapMemory` and get a `void*` you need to be careful that the memory is laid out the same as in the diagram above. One way to ensure this is to use a struct on the host that matches the shader code. + +[source,c++] +---- +struct Metadata { + uint64_t address; + uint32_t status; +}; + +struct Payload { + uint32_t count; + Metadata meta[2]; +} payload; + +payload.count = 2; +payload.meta[0].address = 0xDEADBEEF; +payload.meta[0].status = 20; +payload.meta[1].address = 0xDEADBEEF; +payload.meta[1].status = 5; + +void* data; +vkMapMemory(device, device_memory, 0, VK_WHOLE_SIZE, 0, &data); + +// You can also just memcpy here as well! +Payload *payload_ptr = (Payload*)data; +*payload_ptr = payload; +---- + +If we examine the C++ code here (https://godbolt.org/z/Gq75qq1x6) we can see the assembly also automatically maps the offsets the same as the GLSL code above! \ No newline at end of file diff --git a/chapters/images/buffer_device_address_alignment_1.svg b/chapters/images/buffer_device_address_alignment_1.svg new file mode 100644 index 0000000..e2674c6 --- /dev/null +++ b/chapters/images/buffer_device_address_alignment_1.svg @@ -0,0 +1,4 @@ + + + +
Payload
count
empty
meta[0].address
meta[0].status
empty
meta[1].address
meta[1].status
empty
0
4
8
16
20
24
32
36
40
\ No newline at end of file