diff --git a/src/d3d11/d3d11_context_impl.cpp b/src/d3d11/d3d11_context_impl.cpp index 490d6807..7042ecd6 100644 --- a/src/d3d11/d3d11_context_impl.cpp +++ b/src/d3d11/d3d11_context_impl.cpp @@ -1561,7 +1561,7 @@ template class MTLD3D11DeviceContextImplBase : p if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([IndexType, IndexBufferOffset, Primitive, ArgBuffer = bindable->buffer(), AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); enc.bumpVisibilityResultOffset(); auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer(); auto &cmd = enc.encodeRenderCommand(); @@ -1596,7 +1596,7 @@ template class MTLD3D11DeviceContextImplBase : p } if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([Primitive, ArgBuffer = bindable->buffer(), AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); enc.bumpVisibilityResultOffset(); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandDrawIndirect; @@ -1614,7 +1614,7 @@ template class MTLD3D11DeviceContextImplBase : p auto max_object_threadgroups = max_object_threadgroups_; if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); auto [vertex_per_warp, vertex_increment_per_wrap] = get_gs_vertex_count(topo); @@ -1645,7 +1645,7 @@ template class MTLD3D11DeviceContextImplBase : p if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); auto [vertex_per_warp, vertex_increment_per_wrap] = get_gs_vertex_count(topo); @@ -1677,7 +1677,7 @@ template class MTLD3D11DeviceContextImplBase : p auto max_object_threadgroups = max_object_threadgroups_; if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); auto PatchPerGroup = 32 / enc.tess_threads_per_patch; @@ -1711,7 +1711,7 @@ template class MTLD3D11DeviceContextImplBase : p if (auto bindable = reinterpret_cast(pBufferForArgs)) { EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ); auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4); auto PatchPerGroup = 32 / enc.tess_threads_per_patch; @@ -4668,7 +4668,7 @@ template class MTLD3D11DeviceContextImplBase : p auto &so_slot0 = state_.StreamOutput.Targets[0]; if (so_slot0.Offset == 0xFFFFFFFF) { EmitST([slot0 = so_slot0.Buffer->buffer()](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); + auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetVertexBuffer; cmd.buffer = buffer->buffer();; @@ -4679,7 +4679,7 @@ template class MTLD3D11DeviceContextImplBase : p }); } else { EmitST([slot0 = so_slot0.Buffer->buffer(), offset = so_slot0.Offset](ArgumentEncodingContext &enc) { - auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); + auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE); auto &cmd = enc.encodeRenderCommand(); cmd.type = WMTRenderCommandSetVertexBuffer; cmd.buffer = buffer->buffer();; diff --git a/src/dxmt/dxmt_buffer.cpp b/src/dxmt/dxmt_buffer.cpp index b0cd533b..52638a4c 100644 --- a/src/dxmt/dxmt_buffer.cpp +++ b/src/dxmt/dxmt_buffer.cpp @@ -22,6 +22,10 @@ BufferAllocation::BufferAllocation(WMT::Device device, const WMTBufferInfo &info suballocation_count_ = DXMT_PAGE_SIZE / suballocation_size_; info_.length = DXMT_PAGE_SIZE; } + fenceTrackers.reserve(suballocation_count_); + for (auto i = 0u; i < suballocation_count_; i++) { + fenceTrackers.push_back({}); + } if (flags_.test(BufferAllocationFlag::CpuPlaced)) { placed_buffer = wsi::aligned_malloc(info_.length, DXMT_PAGE_SIZE); info_.memory.set(placed_buffer); @@ -29,7 +33,6 @@ BufferAllocation::BufferAllocation(WMT::Device device, const WMTBufferInfo &info obj_ = device.newBuffer(info_); gpuAddress_ = info_.gpu_address; mappedMemory_ = info_.memory.get_accessible_or_null(); - depkey = EncoderDepSet::generateNewKey(global_buffer_seq.fetch_add(1)); }; BufferAllocation::~BufferAllocation() { @@ -130,10 +133,7 @@ Buffer::createView(BufferViewDescriptor const &descriptor) { Rc Buffer::allocate(Flags flags) { - WMTResourceOptions options = WMTResourceStorageModeShared; - if (flags.test(BufferAllocationFlag::GpuReadonly)) { - options |= WMTResourceHazardTrackingModeUntracked; - } + WMTResourceOptions options = WMTResourceHazardTrackingModeUntracked; if (flags.test(BufferAllocationFlag::CpuWriteCombined)) { options |= WMTResourceOptionCPUCacheModeWriteCombined; } diff --git a/src/dxmt/dxmt_buffer.hpp b/src/dxmt/dxmt_buffer.hpp index 63f1f3b3..178d3790 100644 --- a/src/dxmt/dxmt_buffer.hpp +++ b/src/dxmt/dxmt_buffer.hpp @@ -88,6 +88,10 @@ class BufferAllocation : public Allocation { return current_suballocation_ * stride; } + uint32_t currentSuballocation() { + return current_suballocation_; + } + void updateContents(uint64_t offset, const void *data, uint64_t length, uint32_t suballocation = 0) noexcept { if (likely(mappedMemory_ != nullptr && !flags_.test(BufferAllocationFlag::GpuManaged))) { @@ -98,7 +102,7 @@ class BufferAllocation : public Allocation { } DXMT_RESOURCE_RESIDENCY_STATE residencyState; - EncoderDepKey depkey; + std::vector fenceTrackers; private: BufferAllocation(WMT::Device device, const WMTBufferInfo &info, Flags flags); diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp index b8608fde..ea4cca03 100644 --- a/src/dxmt/dxmt_context.cpp +++ b/src/dxmt/dxmt_context.cpp @@ -1,6 +1,7 @@ #include "dxmt_context.hpp" #include "Metal.hpp" #include "dxmt_command_queue.hpp" +#include "dxmt_deptrack.hpp" #include "dxmt_format.hpp" #include "dxmt_occlusion_query.hpp" #include "dxmt_presenter.hpp" @@ -41,6 +42,10 @@ ArgumentEncodingContext::ArgumentEncodingContext(CommandQueue &queue, WMT::Devic dummy_cbuffer_ = device.newBuffer(dummy_cbuffer_info_); std::memset(dummy_cbuffer_info_.memory.get(), 0, 65536); cpu_buffer_chunks_.emplace_back(); + + for (unsigned i = 0; i < kParityLane; i++) { + fence_pool_[i] = device.newFence(); + } }; ArgumentEncodingContext::~ArgumentEncodingContext() { @@ -75,7 +80,7 @@ ArgumentEncodingContext::encodeVertexBuffers(uint32_t slot_mask, uint64_t offset continue; } auto valid_length = buffer->length() > state.offset ? buffer->length() - state.offset : 0; - auto [buffer_alloc, buffer_offset] = access(buffer, state.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [buffer_alloc, buffer_offset] = access(buffer, state.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); entries[index].buffer_handle = buffer_alloc->gpuAddress() + buffer_offset + state.offset; entries[index].stride = state.stride; entries[index++].length = valid_length; @@ -139,6 +144,9 @@ void ArgumentEncodingContext::encodeConstantBuffers(const MTL_SHADER_REFLECTION *reflection, const MTL_SM50_SHADER_ARGUMENT * constant_buffers, uint64_t offset) { uint64_t *encoded_buffer = getMappedArgumentBuffer(offset); + constexpr bool PreRasterStage = stage == PipelineStage::Vertex || stage == PipelineStage::Domain || + stage == PipelineStage::Hull || stage == PipelineStage::Geometry; + for (unsigned i = 0; i < reflection->NumConstantBuffers; i++) { auto &arg = constant_buffers[i]; auto slot = 14 * unsigned(stage) + arg.SM50BindingSlot; @@ -152,7 +160,7 @@ ArgumentEncodingContext::encodeConstantBuffers(const MTL_SHADER_REFLECTION *refl } auto argbuf = cbuf.buffer; auto valid_length = argbuf->length() > cbuf.offset ? argbuf->length() - cbuf.offset : 0; - auto [argbuf_alloc, argbuf_offset] = access(argbuf, cbuf.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [argbuf_alloc, argbuf_offset] = access(argbuf, cbuf.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ); encoded_buffer[arg.StructurePtrOffset] = argbuf_alloc->gpuAddress() + argbuf_offset + cbuf.offset; makeResident(argbuf.ptr()); break; @@ -240,6 +248,9 @@ ArgumentEncodingContext::encodeShaderResources( auto &UAVBindingSet = stage == PipelineStage::Compute ? cs_uav_ : om_uav_; + constexpr bool PreRasterStage = stage == PipelineStage::Vertex || stage == PipelineStage::Domain || + stage == PipelineStage::Hull || stage == PipelineStage::Geometry; + for (unsigned i = 0; i < BindingCount; i++) { auto &arg = arguments[i]; switch (arg.Type) { @@ -266,7 +277,7 @@ ArgumentEncodingContext::encodeShaderResources( if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_BUFFER) { if (srv.buffer.ptr()) { - auto [srv_alloc, offset] = access(srv.buffer, srv.slice.byteOffset, srv.slice.byteLength, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [srv_alloc, offset] = access(srv.buffer, srv.slice.byteOffset, srv.slice.byteLength, DXMT_ENCODER_RESOURCE_ACESS_READ); encoded_buffer[arg.StructurePtrOffset] = srv_alloc->gpuAddress() + offset + srv.slice.byteOffset; encoded_buffer[arg.StructurePtrOffset + 1] = srv.slice.byteLength; makeResident(srv.buffer.ptr()); @@ -277,7 +288,7 @@ ArgumentEncodingContext::encodeShaderResources( } else if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE) { if (srv.buffer.ptr()) { assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TBUFFER_OFFSET); - auto [view, offset] = access(srv.buffer, srv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [view, offset] = access(srv.buffer, srv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ); encoded_buffer[arg.StructurePtrOffset] = view.gpu_resource_id; encoded_buffer[arg.StructurePtrOffset + 1] = ((uint64_t)srv.slice.elementCount << 32) | (uint64_t)(srv.slice.firstElement + offset); @@ -286,7 +297,7 @@ ArgumentEncodingContext::encodeShaderResources( assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_MINLOD_CLAMP); auto viewIdChecked = srv.texture->checkViewUseArray(srv.viewId, arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_ARRAY); encoded_buffer[arg.StructurePtrOffset] = - access(srv.texture, viewIdChecked, DXMT_ENCODER_RESOURCE_ACESS_READ).gpuResourceID; + access(srv.texture, viewIdChecked, DXMT_ENCODER_RESOURCE_ACESS_READ).gpuResourceID; encoded_buffer[arg.StructurePtrOffset + 1] = TextureMetadata(srv.texture->arrayLength(viewIdChecked), 0); makeResident(srv.texture.ptr(), viewIdChecked); } else { @@ -306,7 +317,7 @@ ArgumentEncodingContext::encodeShaderResources( if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_BUFFER) { if (uav.buffer.ptr()) { - auto [uav_alloc, offset] = access(uav.buffer, uav.slice.byteOffset, uav.slice.byteLength, access_flags); + auto [uav_alloc, offset] = access(uav.buffer, uav.slice.byteOffset, uav.slice.byteLength, access_flags); encoded_buffer[arg.StructurePtrOffset] = uav_alloc->gpuAddress() + offset + uav.slice.byteOffset; encoded_buffer[arg.StructurePtrOffset + 1] = uav.slice.byteLength; makeResident(uav.buffer.ptr(), read, write); @@ -317,7 +328,7 @@ ArgumentEncodingContext::encodeShaderResources( } else if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE) { if (uav.buffer.ptr()) { assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TBUFFER_OFFSET); - auto [view, offset] = access(uav.buffer, uav.viewId, access_flags); + auto [view, offset] = access(uav.buffer, uav.viewId, access_flags); encoded_buffer[arg.StructurePtrOffset] = view.gpu_resource_id; encoded_buffer[arg.StructurePtrOffset + 1] = ((uint64_t)uav.slice.elementCount << 32) | (uint64_t)(uav.slice.firstElement + offset); @@ -325,7 +336,7 @@ ArgumentEncodingContext::encodeShaderResources( } else if (uav.texture.ptr()) { assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_MINLOD_CLAMP); auto viewIdChecked = uav.texture->checkViewUseArray(uav.viewId, arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_ARRAY); - encoded_buffer[arg.StructurePtrOffset] = access(uav.texture, viewIdChecked, access_flags).gpuResourceID; + encoded_buffer[arg.StructurePtrOffset] = access(uav.texture, viewIdChecked, access_flags).gpuResourceID; encoded_buffer[arg.StructurePtrOffset + 1] = TextureMetadata(uav.texture->arrayLength(viewIdChecked), 0); makeResident(uav.texture.ptr(), viewIdChecked, read, write); } else { @@ -335,7 +346,7 @@ ArgumentEncodingContext::encodeShaderResources( } if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_UAV_COUNTER) { if (uav.counter) { - auto [counter_alloc, offset] = access(uav.counter, 0, 4, DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE); + auto [counter_alloc, offset] = access(uav.counter, 0, 4, DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE); encoded_buffer[arg.StructurePtrOffset + 2] = counter_alloc->gpuAddress() + offset; makeResident(uav.counter.ptr(), true, true); } else { @@ -395,6 +406,8 @@ ArgumentEncodingContext::clearColor(Rc &&texture, unsigned viewId, unsi auto encoder_info = allocate(); encoder_info->type = EncoderType::Clear; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->clear_dsv = 0; encoder_info->color = color; encoder_info->array_length = arrayLength; @@ -417,6 +430,8 @@ ArgumentEncodingContext::clearDepthStencil( auto encoder_info = allocate(); encoder_info->type = EncoderType::Clear; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->clear_dsv = flag & DepthStencilPlanarFlags(texture->pixelFormat()); encoder_info->depth_stencil = {depth, stencil}; encoder_info->array_length = arrayLength; @@ -438,6 +453,9 @@ ArgumentEncodingContext::resolveTexture( assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::Resolve; + encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_current = encoder_info; encoder_info->src = access(src, src_view, DXMT_ENCODER_RESOURCE_ACESS_READ); @@ -452,14 +470,14 @@ ArgumentEncodingContext::present(Rc &texture, Rc &presenter, auto encoder_info = allocate(); encoder_info->type = EncoderType::Present; encoder_info->id = nextEncoderId(); - encoder_info->backbuffer = texture->current()->texture(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->presenter = presenter; encoder_info->after = after; encoder_info->metadata = metadata; - encoder_info->tex_read.add(texture->current()->depkey); - encoder_current = encoder_info; + encoder_info->backbuffer = access(texture, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; endPass(); } @@ -469,14 +487,13 @@ ArgumentEncodingContext::upscale(Rc &texture, Rc &upscaled, WM auto encoder_info = allocate(); encoder_info->type = EncoderType::SpatialUpscale; encoder_info->id = nextEncoderId(); - encoder_info->backbuffer = texture->current()->texture(); - encoder_info->upscaled = upscaled->current()->texture(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->scaler = scaler; - encoder_info->tex_read.add(texture->current()->depkey); - encoder_info->tex_write.add(upscaled->current()->depkey); - encoder_current = encoder_info; + encoder_info->backbuffer = access(texture, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->upscaled = access(upscaled, 0, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture; endPass(); } @@ -489,25 +506,19 @@ ArgumentEncodingContext::upscaleTemporal( auto encoder_info = allocate(); encoder_info->type = EncoderType::TemporalUpscale; encoder_info->id = nextEncoderId(); - encoder_info->input = input->current()->texture(); - encoder_info->output = output->current()->texture(); - encoder_info->depth = depth->current()->texture(); - encoder_info->motion_vector = motion_vector->view(mvViewId).texture; + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->scaler = scaler; encoder_info->props = props; - encoder_info->tex_read.add(input->current()->depkey); - encoder_info->tex_read.add(depth->current()->depkey); - encoder_info->tex_read.add(motion_vector->current()->depkey); - encoder_info->tex_write.add(output->current()->depkey); - if(exposure) { - encoder_info->exposure = exposure->current()->texture(); - encoder_info->tex_read.add(exposure->current()->depkey); - } else { - encoder_info->exposure = nullptr; - } - encoder_current = encoder_info; + encoder_info->input = access(input, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->depth = access(depth, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->motion_vector = access(motion_vector, mvViewId, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + encoder_info->output = access(output, 0, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture; + if (exposure) { + encoder_info->exposure = access(exposure, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture; + } endPass(); } @@ -516,7 +527,7 @@ ArgumentEncodingContext::signalEvent(uint64_t value) { assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::SignalEvent; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->event = queue_.event; encoder_info->value = value; @@ -529,7 +540,7 @@ ArgumentEncodingContext::signalEvent(WMT::Reference &&event, uint64_ assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::SignalEvent; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->event = std::move(event); encoder_info->value = value; @@ -542,7 +553,7 @@ ArgumentEncodingContext::waitEvent(WMT::Reference &&event, uint64_t assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::WaitForEvent; - encoder_info->id = nextEncoderId(); + encoder_info->id = ~0ull; encoder_info->event = std::move(event); encoder_info->value = value; @@ -557,7 +568,12 @@ ArgumentEncodingContext::startRenderPass( assert(!encoder_current); auto encoder_info = allocate(); encoder_info->type = EncoderType::Render; + encoder_info->encoder_id_vertex = nextEncoderId(); + encoder_info->fence_wait_vertex = {}; + encoder_info->fence_update_vertex = {encoder_info->encoder_id_vertex}; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->cmd_head.type = WMTRenderCommandNop; encoder_info->cmd_head.next.set(0); encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head; @@ -583,6 +599,8 @@ ArgumentEncodingContext::startComputePass(uint64_t encoder_argbuf_size) { auto encoder_info = allocate(); encoder_info->type = EncoderType::Compute; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->cmd_head.type = WMTComputeCommandNop; encoder_info->cmd_head.next.set(0); encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head; @@ -603,6 +621,8 @@ ArgumentEncodingContext::startBlitPass() { auto encoder_info = allocate(); encoder_info->type = EncoderType::Blit; encoder_info->id = nextEncoderId(); + encoder_info->fence_wait = {}; + encoder_info->fence_update = {encoder_info->id}; encoder_info->cmd_head.type = WMTBlitCommandNop; encoder_info->cmd_head.next.set(0); encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head; @@ -622,6 +642,19 @@ ArgumentEncodingContext::endPass() { if (encoder_current->type == EncoderType::Render) vro_state_.endEncoder(); + if (encoder_current->id != ~0ull) { + if (encoder_current->type == EncoderType::Render) { + auto render_encoder = static_cast(encoder_current); + render_encoder->fence_wait_vertex = + fence_locality_.collectAndSimplifyWaits(render_encoder->fence_wait_vertex, render_encoder->encoder_id_vertex); + encoder_current->fence_wait = + fence_locality_.collectAndSimplifyWaits(encoder_current->fence_wait, encoder_last->id, true); + } else { + encoder_current->fence_wait = + fence_locality_.collectAndSimplifyWaits(encoder_current->fence_wait, encoder_last->id); + } + } + encoder_current = nullptr; encoder_count_++; } @@ -705,9 +738,10 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId if (encoder_count > 1) { unsigned j, i; for (j = encoder_count - 2; j != ~0u; j--) { - if (encoders[j]->type == EncoderType::Null) + // TODO(fences): we don't actively move encoders other than clear and render + if (encoders[j]->type != EncoderType::Clear && encoders[j]->type != EncoderType::Render) continue; - for (i = j + 1; i < std::min(encoder_count, j + kEncoderOptimizerThreshold); i++) { + for (i = j + 1; i < encoder_count; i++) { if (encoders[i]->type == EncoderType::Null) continue; if (checkEncoderRelation(encoders[j], encoders[i]) == DXMT_ENCODER_LIST_OP_SYNCHRONIZE) @@ -783,6 +817,11 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId } auto gpu_buffer_ = data->allocated_argbuf; auto encoder = cmdbuf.renderCommandEncoder(render_pass_info); + data->fence_wait.forEach( + data->fence_wait_vertex, // if a fence is waited pre-raster, no need to wait again at fragment + [&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStagePreRaster); }, + [&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); } + ); encoder.setVertexBuffer(gpu_buffer_, 0, 16); encoder.setVertexBuffer(gpu_buffer_, 0, 29); encoder.setVertexBuffer(gpu_buffer_, 0, 30); @@ -859,6 +898,11 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId ); } encoder.encodeCommands(&data->cmd_head); + data->fence_update_vertex.forEach( + data->fence_update, // if a fence is updated at fragment, no need to update again pre-raster + [&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }, + [&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStagePreRaster); } + ); encoder.endEncoding(); data->~RenderEncoderData(); break; @@ -866,6 +910,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId case EncoderType::Compute: { auto data = static_cast(current); auto encoder = cmdbuf.computeCommandEncoder(false); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id]); }); struct wmtcmd_compute_setbuffer setcmd; setcmd.type = WMTComputeCommandSetBuffer; setcmd.next.set(nullptr); @@ -876,6 +921,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId setcmd.index = 30; encoder.encodeCommands((const wmtcmd_compute_nop *)&setcmd); encoder.encodeCommands(&data->cmd_head); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id]); }); encoder.endEncoding(); data->~ComputeEncoderData(); break; @@ -883,7 +929,9 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId case EncoderType::Blit: { auto data = static_cast(current); auto encoder = cmdbuf.blitCommandEncoder(); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id]); }); encoder.encodeCommands(&data->cmd_head); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id]); }); encoder.endEncoding(); data->~BlitEncoderData(); break; @@ -891,7 +939,15 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId case EncoderType::Present: { auto data = static_cast(current); auto t0 = clock::now(); - auto drawable = data->presenter->encodeCommands(cmdbuf, {}, data->backbuffer, data->metadata); + auto drawable = data->presenter->encodeCommands( + cmdbuf, data->backbuffer, data->metadata, + [&](WMT::RenderCommandEncoder encoder) { + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }); + }, + [&](WMT::RenderCommandEncoder encoder) { + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }); + } + ); auto t1 = clock::now(); currentFrameStatistics().drawable_blocking_interval += (t1 - t0); if (data->after > 0) @@ -930,6 +986,8 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId info.render_target_array_length = data->array_length; auto encoder = cmdbuf.renderCommandEncoder(info); encoder.setLabel(WMT::String::string("ClearPass", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }); encoder.endEncoding(); } data->~ClearEncoderData(); @@ -947,6 +1005,8 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId auto encoder = cmdbuf.renderCommandEncoder(info); encoder.setLabel(WMT::String::string("ResolvePass", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }); + data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); }); encoder.endEncoding(); } data->~ResolveEncoderData(); @@ -954,7 +1014,15 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId } case EncoderType::SpatialUpscale: { auto data = static_cast(current); - cmdbuf.encodeSpatialScale(data->scaler, data->backbuffer, data->upscaled, {}); + auto fence_muxer = cmdbuf.blitCommandEncoder(); + fence_muxer.setLabel(WMT::String::string("FenceMultiplexer", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { fence_muxer.waitForFence(fence_pool_[id]); }); + data->fence_update.forEach([&](auto id) { fence_muxer.updateFence(fence_pool_[id]); }); + fence_muxer.endEncoding(); + data->fence_update.forEach([&](auto id) { + // TODO(fences): we are expecting fence_update contains exactly one fence + cmdbuf.encodeSpatialScale(data->scaler, data->backbuffer, data->upscaled, fence_pool_[id]); + }); data->~SpatialUpscaleData(); break; } @@ -972,7 +1040,18 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId } case EncoderType::TemporalUpscale: { auto data = static_cast(current); - cmdbuf.encodeTemporalScale(data->scaler, data->input, data->output, data->depth, data->motion_vector, data->exposure, {}, data->props); + auto fence_muxer = cmdbuf.blitCommandEncoder(); + fence_muxer.setLabel(WMT::String::string("FenceMultiplexer", WMTUTF8StringEncoding)); + data->fence_wait.forEach([&](auto id) { fence_muxer.waitForFence(fence_pool_[id]); }); + data->fence_update.forEach([&](auto id) { fence_muxer.updateFence(fence_pool_[id]); }); + fence_muxer.endEncoding(); + data->fence_update.forEach([&](auto id) { + // TODO(fences): we are expecting fence_update contains exactly one fence + cmdbuf.encodeTemporalScale( + data->scaler, data->input, data->output, data->depth, data->motion_vector, data->exposure, fence_pool_[id], + data->props + ); + }); data->~TemporalUpscaleData(); break; } @@ -998,6 +1077,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId DXMT_ENCODER_LIST_OP ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *latter) { + if (former->type == EncoderType::Null) return DXMT_ENCODER_LIST_OP_SWAP; if (latter->type == EncoderType::Null) @@ -1025,7 +1105,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * depth_attachment->clear_depth = clear->depth_stencil.first; depth_attachment->load_action = WMTLoadActionClear; depth_attachment->store_action = WMTStoreActionStore; - render->tex_write.merge(clear->tex_write); + render->fence_update.merge(clear->fence_update); + render->fence_wait.merge(clear->fence_wait); + render->fence_wait.subtract(clear->fence_update); } clear->clear_dsv &= ~1; } @@ -1034,7 +1116,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * stencil_attachment->clear_stencil = clear->depth_stencil.second; stencil_attachment->load_action = WMTLoadActionClear; stencil_attachment->store_action = WMTStoreActionStore; - render->tex_write.merge(clear->tex_write); + render->fence_update.merge(clear->fence_update); + render->fence_wait.merge(clear->fence_wait); + render->fence_wait.subtract(clear->fence_update); } clear->clear_dsv &= ~2; } @@ -1050,8 +1134,11 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (attachment->load_action == WMTLoadActionLoad) { attachment->load_action = WMTLoadActionClear; attachment->clear_color = clear->color; - if (attachment->store_action != WMTStoreActionDontCare) - render->tex_write.merge(clear->tex_write); + if (attachment->store_action != WMTStoreActionDontCare) { + render->fence_update.merge(clear->fence_update); + render->fence_wait.merge(clear->fence_wait); + render->fence_wait.subtract(clear->fence_update); + } } currentFrameStatistics().clear_pass_optimized++; @@ -1081,7 +1168,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * if (result.src) { result.src->store_action = WMTStoreActionStoreAndMultisampleResolve; result.src->resolve_attachment = result.dst; - render->tex_write.merge(resolve->tex_write); + render->fence_update.merge(resolve->fence_update); + render->fence_wait.merge(resolve->fence_wait); + render->fence_wait.subtract(resolve->fence_update); currentFrameStatistics().resolve_pass_optimized++; resolve->~ResolveEncoderData(); @@ -1097,7 +1186,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * auto r1 = reinterpret_cast(latter); auto r0 = reinterpret_cast(former); - if (isEncoderSignatureMatched(r0, r1)) { + if (isEncoderSignatureMatched(r0, r1) && + // can't merge if latter's vertex wait for former's fragment + !r1->fence_wait_vertex.intersectedWith(r0->fence_update)) { for (unsigned i = 0; i < r0->render_target_count; i++) { auto &a0 = r0->colors[i]; auto &a1 = r1->colors[i]; @@ -1134,10 +1225,19 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * r1->ts_arg_marshal_tasks = std::move(r0->ts_arg_marshal_tasks); r1->use_visibility_result = r0->use_visibility_result || r1->use_visibility_result; - r1->buf_read.merge(r0->buf_read); - r1->buf_write.merge(r0->buf_write); - r1->tex_read.merge(r0->tex_read); - r1->tex_write.merge(r0->tex_write); + r1->fence_update.merge(r0->fence_update); + r1->fence_wait.merge(r0->fence_wait); + r1->fence_wait.subtract(r0->fence_update); + r1->fence_update_vertex.merge(r0->fence_update_vertex); + r1->fence_wait_vertex.merge(r0->fence_wait_vertex); + r1->fence_wait_vertex.subtract(r0->fence_update_vertex); + + // just in case + r1->fence_wait.subtract(r0->fence_update_vertex); + /* + r1->fence_wait_vertex.subtract(r0->fence_update); + does not make sense + */ currentFrameStatistics().render_pass_optimized++; r0->~RenderEncoderData(); @@ -1153,26 +1253,26 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData * bool ArgumentEncodingContext::hasDataDependency(EncoderData *latter, EncoderData *former) { - if (latter->type == EncoderType::Clear && former->type == EncoderType::Clear) { - // FIXME: prove it's safe to return false - return false; + if (former->type == EncoderType::Render) { + auto r0 = reinterpret_cast(former); + FenceSet fence_wait_r0 = r0->fence_wait.unionOf(r0->fence_wait_vertex); + FenceSet fence_update_r0 = r0->fence_update_vertex.unionOf(r0->fence_update); + if (latter->type == EncoderType::Render) { + auto r1 = reinterpret_cast(latter); + FenceSet fence_wait_r1 = r1->fence_wait.unionOf(r1->fence_wait_vertex); + FenceSet fence_update_r1 = r1->fence_update_vertex.unionOf(r1->fence_update); + return fence_update_r0.intersectedWith(fence_wait_r1) || fence_update_r1.intersectedWith(fence_wait_r0); + } + return fence_update_r0.intersectedWith(latter->fence_wait) || latter->fence_update.intersectedWith(fence_wait_r0); + } + if (latter->type == EncoderType::Render) { + auto r1 = reinterpret_cast(latter); + FenceSet fence_wait = r1->fence_wait.unionOf(r1->fence_wait_vertex); + FenceSet fence_update = r1->fence_update_vertex.unionOf(r1->fence_update); + return former->fence_update.intersectedWith(fence_wait) || fence_update.intersectedWith(former->fence_wait); } - // read-after-write - if (!former->buf_write.isDisjointWith(latter->buf_read)) - return true; - if (!former->tex_write.isDisjointWith(latter->tex_read)) - return true; - // write-after-write - if (!former->buf_write.isDisjointWith(latter->buf_write)) - return true; - if (!former->tex_write.isDisjointWith(latter->tex_write)) - return true; - // write-after-read - if (!former->buf_read.isDisjointWith(latter->buf_write)) - return true; - if (!former->tex_read.isDisjointWith(latter->tex_write)) - return true; - return false; + return former->fence_update.intersectedWith(latter->fence_wait) || + latter->fence_update.intersectedWith(former->fence_wait); } bool diff --git a/src/dxmt/dxmt_context.hpp b/src/dxmt/dxmt_context.hpp index 63f35e0e..e5ad1718 100644 --- a/src/dxmt/dxmt_context.hpp +++ b/src/dxmt/dxmt_context.hpp @@ -93,11 +93,9 @@ enum class EncoderType { struct EncoderData { EncoderType type; EncoderData *next = nullptr; - uint64_t id; - EncoderDepSet buf_read; - EncoderDepSet buf_write; - EncoderDepSet tex_read; - EncoderDepSet tex_write; + EncoderId id; + FenceSet fence_wait; + FenceSet fence_update; }; struct GSDispatchArgumentsMarshal { @@ -167,6 +165,9 @@ struct RenderEncoderData : EncoderData { wmtcmd_base *cmd_tail; WMT::Buffer allocated_argbuf; uint64_t allocated_argbuf_offset; + uint64_t encoder_id_vertex; + FenceSet fence_wait_vertex; + FenceSet fence_update_vertex; void *allocated_argbuf_mapping; uint8_t dsv_planar_flags; uint8_t dsv_readonly_flags; @@ -283,12 +284,6 @@ enum DXMT_ENCODER_LIST_OP { class CommandQueue; -enum DXMT_ENCODER_RESOURCE_ACESS { - DXMT_ENCODER_RESOURCE_ACESS_READ = 1 <<0, - DXMT_ENCODER_RESOURCE_ACESS_WRITE = 1 << 1, - DXMT_ENCODER_RESOURCE_ACESS_READWRITE = DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE, -}; - struct AllocatedTempBufferSlice { WMT::Buffer gpu_buffer; uint64_t offset; @@ -296,55 +291,83 @@ struct AllocatedTempBufferSlice { }; class ArgumentEncodingContext { + template void trackBuffer(BufferAllocation *allocation, DXMT_ENCODER_RESOURCE_ACESS flags) { retainAllocation(allocation); if (allocation->flags().test(BufferAllocationFlag::GpuReadonly)) return; - if (flags & DXMT_ENCODER_RESOURCE_ACESS_READ) - encoder_current->buf_read.add(allocation->depkey); + auto &tracker = allocation->fenceTrackers[allocation->currentSuballocation()]; + if constexpr (PreRasterStage) { + auto current_encoder = currentRenderEncoder(); + auto id = current_encoder->encoder_id_vertex; + if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE) + tracker.write(id, current_encoder->fence_wait_vertex); + else + tracker.read(id, current_encoder->fence_wait_vertex); + return; + } + auto current_encoder = currentEncoder(); if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE) - encoder_current->buf_write.add(allocation->depkey); + tracker.write(currentEncoderId(), current_encoder->fence_wait); + else + tracker.read(currentEncoderId(), current_encoder->fence_wait); } + template void trackTexture(TextureAllocation *allocation, DXMT_ENCODER_RESOURCE_ACESS flags) { retainAllocation(allocation); if (allocation->flags().test(TextureAllocationFlag::GpuReadonly)) return; - if (flags & DXMT_ENCODER_RESOURCE_ACESS_READ) - encoder_current->tex_read.add(allocation->depkey); + auto &tracker = allocation->fenceTracker; + if constexpr (PreRasterStage) { + auto current_encoder = currentRenderEncoder(); + auto id = current_encoder->encoder_id_vertex; + if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE) + tracker.write(id, current_encoder->fence_wait_vertex); + else + tracker.read(id, current_encoder->fence_wait_vertex); + return; + } + auto current_encoder = currentEncoder(); if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE) - encoder_current->tex_write.add(allocation->depkey); + tracker.write(currentEncoderId(), current_encoder->fence_wait); + else + tracker.read(currentEncoderId(), current_encoder->fence_wait); } public: + template std::pair access(Rc const &buffer, unsigned offset, unsigned length, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = buffer->current(); - trackBuffer(allocation, flags); + trackBuffer(allocation, flags); return {allocation, allocation->currentSuballocationOffset()}; } + template std::pair access(Rc const &buffer, unsigned viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = buffer->current(); - trackBuffer(allocation, flags); + trackBuffer(allocation, flags); auto &view = buffer->view_(viewId, allocation); return {view, allocation->currentSuballocationOffset(view.suballocation_texel)}; } + template WMT::Texture access(Rc const &texture, unsigned level, unsigned slice, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = texture->current(); - trackTexture(allocation, flags); + trackTexture(allocation, flags); return allocation->texture(); } + template TextureView & access(Rc const &texture, unsigned viewId, DXMT_ENCODER_RESOURCE_ACESS flags) { auto allocation = texture->current(); - trackTexture(allocation, flags); + trackTexture(allocation, flags); return texture->view(viewId, allocation); } @@ -425,7 +448,7 @@ class ArgumentEncodingContext { std::pair currentIndexBuffer() { // because of indirect draw, we can't predicate the accessed buffer range - auto [ibuf_alloc, offset] = access(ibuf_, 0, ibuf_->length(), DXMT_ENCODER_RESOURCE_ACESS_READ); + auto [ibuf_alloc, offset] = access(ibuf_, 0, ibuf_->length(), DXMT_ENCODER_RESOURCE_ACESS_READ); return {ibuf_alloc->buffer(), offset}; }; @@ -586,8 +609,7 @@ class ArgumentEncodingContext { uint64_t nextEncoderId() { - static std::atomic_uint64_t global_id = 0; - return global_id.fetch_add(1); + return encoder_id_++; }; void clearColor(Rc &&texture, unsigned viewId, unsigned arrayLength, WMTClearColor color); @@ -610,9 +632,15 @@ class ArgumentEncodingContext { return encoder_current; } + constexpr uint64_t + currentEncoderId() { + assert(encoder_current); + return encoder_current->id; + } + constexpr RenderEncoderData * currentRenderEncoder() { - assert(encoder_current->type == EncoderType::Render); + assert(encoder_current && encoder_current->type == EncoderType::Render); return static_cast(encoder_current); } @@ -750,10 +778,14 @@ class ArgumentEncodingContext { void *dummy_cbuffer_host_; WMTBufferInfo dummy_cbuffer_info_; - EncoderData encoder_head = {EncoderType::Null, nullptr}; + EncoderData encoder_head = {EncoderType::Null, nullptr, ~0ull}; EncoderData *encoder_last = &encoder_head; EncoderData *encoder_current = nullptr; unsigned encoder_count_ = 0; + + uint64_t encoder_id_ = kParityLane; // actually important to not start from 0 + std::array, kParityLane> fence_pool_; + FenceLocalityCheck fence_locality_; uint64_t seq_id_; uint64_t frame_id_; diff --git a/src/dxmt/dxmt_deptrack.cpp b/src/dxmt/dxmt_deptrack.cpp new file mode 100644 index 00000000..8b3fe3e9 --- /dev/null +++ b/src/dxmt/dxmt_deptrack.cpp @@ -0,0 +1,77 @@ +#include "dxmt_deptrack.hpp" +#include + +namespace dxmt { + +void +GenericAccessTracker::read(EncoderId id, FenceSet &wait_fences) { + if (read_.add(id)) { + write_.enumerate(id, [&](EncoderId id) { wait_fences.set(id); }); + } +} + +void +GenericAccessTracker::write(EncoderId id, FenceSet &wait_fences) { + if (write_.check(id)) { + write_.enumerate(id, [&](EncoderId id) { wait_fences.set(id); }); + read_.enumerate(id, [&](EncoderId id) { wait_fences.set(id); }); + read_.clear(); + write_.clearAndAdd(id); + } +} + +class WeakFenceMaskLTO { +public: + constexpr WeakFenceMaskLTO() { + int i = 0; + for (int p = 0; p < kParity; ++p) { + for (int l = 0; l < kLane; ++l) { + weak_fences_lto[i++].fillGenerationBefore(p, l); + } + } + } + + const FenceSet & + operator[](EncoderId i) const { + return weak_fences_lto[i % kParityLane]; + } + +private: + FenceSet weak_fences_lto[kParityLane]; +}; + +constexpr auto WEAK_FENCE_MASK = WeakFenceMaskLTO(); + +FenceSet +FenceLocalityCheck::collectAndSimplifyWaits(FenceSet strong_fences, EncoderId id, bool implicit_pre_raster_wait) { + if (implicit_pre_raster_wait) + strong_fences.set(id - 1); + + FenceSet full_fences(strong_fences); + full_fences.mergeWithLaneMaskOff(WEAK_FENCE_MASK[id], strong_fences.laneMask()); + + FenceSet minimal_fences; + FenceSet accessible_fences; + + constexpr auto start_offset = kParityLane == 1 ? 0 : 1; + + for (auto offset = start_offset; offset < kParityLane; offset++) { + EncoderId prev_encoder_id = id - offset; + + if (full_fences.test(prev_encoder_id) && !accessible_fences.testAndSet(prev_encoder_id)) + minimal_fences.set(prev_encoder_id); + if (accessible_fences.test(prev_encoder_id)) + accessible_fences.merge(summary_[prev_encoder_id % kParityLane]); + if (accessible_fences.contains(full_fences)) + break; + } + + summary_[id % kParityLane] = full_fences; + + if (implicit_pre_raster_wait) + minimal_fences.unset(id - 1); + + return minimal_fences; +} + +} // namespace dxmt \ No newline at end of file diff --git a/src/dxmt/dxmt_deptrack.hpp b/src/dxmt/dxmt_deptrack.hpp index be001a85..e6eefccc 100644 --- a/src/dxmt/dxmt_deptrack.hpp +++ b/src/dxmt/dxmt_deptrack.hpp @@ -1,7 +1,297 @@ #pragma once -#include "util_bloom.hpp" +#include +#include +#include +#include namespace dxmt { -using EncoderDepSet = PartitionedBloomFilter64<16>; -using EncoderDepKey = EncoderDepSet::Key; + +enum DXMT_ENCODER_RESOURCE_ACESS { + DXMT_ENCODER_RESOURCE_ACESS_READ = 1 << 0, + DXMT_ENCODER_RESOURCE_ACESS_WRITE = 1 << 1, + DXMT_ENCODER_RESOURCE_ACESS_READWRITE = DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE, +}; + +constexpr auto kLog2Lane = 6ull; +constexpr auto kLane = 1 << kLog2Lane; +constexpr auto kLaneMask = kLane - 1; +constexpr auto kAllLaneMask = ~0ull >> (64 /* uint64_t */ - kLane); +constexpr auto kParity = 4; // can also use 3, although power of 2 is nice +constexpr auto kParityLane = kParity * kLane; + +static_assert(kLog2Lane <= 6); +static_assert(kLane > 1); + +using LaneStorage = uint64_t; +using EncoderId = uint64_t; + +constexpr auto +PARITY(EncoderId id) { + return (id >> kLog2Lane) % kParity; +} + +constexpr auto +LANE(EncoderId id) { + return id & kLaneMask; +} + +class FenceSet { +public: + constexpr FenceSet() { + for (int i = 0; i < kParity; i++) { + storage_[i] = 0; + } + } + + constexpr FenceSet(EncoderId id) { + for (int i = 0; i < kParity; i++) { + storage_[i] = 0; + } + set(id); + } + + FenceSet(const FenceSet ©) { + memcpy(&storage_, ©.storage_, sizeof(storage_)); + } + + FenceSet & + operator=(const dxmt::FenceSet ©) { + memcpy(&storage_, ©.storage_, sizeof(storage_)); + return *this; + } + + ~FenceSet() = default; + + constexpr void + set(EncoderId id) { + storage_[PARITY(id)] |= (1ull << LANE(id)); + } + + constexpr void + unset(EncoderId id) { + storage_[PARITY(id)] &= (kAllLaneMask & ~(1ull << LANE(id))); + } + + constexpr void + fillGenerationBefore(int parity, int lane) { + const int idx = (parity + kParity + (kParity - 1)) * kLane + lane; + for (int offset = 0; offset < kLane; ++offset) { + set(idx - offset); + } + } + + constexpr bool + test(EncoderId id) const { + return storage_[PARITY(id)] & (1ull << LANE(id)); + } + + constexpr bool + testAndSet(EncoderId id) { + auto P = PARITY(id); + auto LM = 1ull << LANE(id); + if (storage_[P] & LM) + return true; + storage_[P] |= LM; + return false; + } + + FenceSet & + intersect(const FenceSet &set) { + for (int i = 0; i < kParity; i++) { + storage_[i] &= set.storage_[i]; + } + return *this; + } + + constexpr bool + intersectedWith(const FenceSet &set) const { + for (int i = 0; i < kParity; i++) { + if (storage_[i] & set.storage_[i]) + return true; + } + return false; + } + + constexpr bool + contains(const FenceSet &set) const { + for (int i = 0; i < kParity; i++) { + if ((storage_[i] & set.storage_[i]) != set.storage_[i]) + return false; + } + return true; + } + + FenceSet & + merge(const FenceSet &set) { + for (int i = 0; i < kParity; i++) { + storage_[i] |= set.storage_[i]; + } + return *this; + } + + FenceSet + unionOf(const FenceSet &set) const { + FenceSet ret{}; + for (int i = 0; i < kParity; i++) { + ret.storage_[i] = storage_[i] | set.storage_[i]; + } + return ret; + } + + FenceSet & + subtract(const FenceSet &set) { + for (int i = 0; i < kParity; i++) { + storage_[i] &= (kAllLaneMask & ~set.storage_[i]); + } + return *this; + } + + FenceSet & + mergeWithLaneMaskOff(const FenceSet &set, const LaneStorage &mask) { + for (int i = 0; i < kParity; i++) { + storage_[i] |= (set.storage_[i] & (kAllLaneMask & ~mask)); + } + return *this; + } + + LaneStorage + laneMask() const { + LaneStorage ret = 0; + for (int i = 0; i < kParity; i++) { + ret |= storage_[i]; + } + return ret; + } + + template + void + forEach(Fn &&fn) { + static_assert(kParity == 4); + for (int P = 0; P < kParity; P++) { + auto lanes = storage_[P]; + if (lanes == 0) + continue; + for (int L = 0; L < kLane; L++) { + if (lanes & (1ull << L)) { + fn(P * kLane + L); + } + } + } + + // for (int i = 0; i < kParityLane; i++) { + // if (test(i)) { + // fn(i); + // } + // } + } + + template + void + forEach(const FenceSet &prior, FnPrior &&fnPrior, Fn &&fn) { + static_assert(kParity == 4); + for (int P = 0; P < kParity; P++) { + auto lanes = storage_[P]; + auto lanes_prior = prior.storage_[P]; + if ((lanes | lanes_prior) == 0) + continue; + for (int L = 0; L < kLane; L++) { + if (lanes_prior & (1ull << L)) { + fnPrior(P * kLane + L); + } else if (lanes & (1ull << L)) { + fn(P * kLane + L); + } + } + } + + // for (int i = 0; i < kParityLane; i++) { + // if (prior.test(i)) { + // fnPrior(i); + // } else if (test(i)) { + // fn(i); + // } + // } + } + +private: + LaneStorage storage_[kParity]; +}; + +class TrackingSet { +public: + TrackingSet() { + cursor = 0; + clear(); + }; + + bool + add(EncoderId id) { + assert(storage_[cursor] <= id); + if (storage_[cursor] == id) + return false; + bumpCursor(); + storage_[cursor] = id; + return true; + }; + + bool + check(EncoderId id) { + return storage_[cursor] != id; + } + + void + clear() { + storage_[cursor] = 0; + }; + + void + clearAndAdd(EncoderId id) { + storage_[cursor] = 0; + bumpCursor(); + storage_[cursor] = id; + }; + + template + void + enumerate(EncoderId id, Fn &&fn) { + for (int i = 0; i < kLane; i++) { + auto c = storage_[(cursor + kLane - i) & kLaneMask]; + if (c == id) + continue; + if (c > (id - kLane)) { + fn(c); + continue; + } + break; + } + } + +private: + void + bumpCursor() { + cursor++; + cursor &= kLaneMask; + } + + EncoderId storage_[kLane]; + uint32_t cursor; +}; + +class GenericAccessTracker { +public: + void read(EncoderId id, FenceSet &); + void write(EncoderId id, FenceSet &); + +private: + TrackingSet read_; + TrackingSet write_; +}; + +class FenceLocalityCheck { +public: + FenceSet collectAndSimplifyWaits(FenceSet strong_fences, EncoderId id, bool implicit_pre_raster_wait = false); + +private: + std::array summary_; +}; + } // namespace dxmt diff --git a/src/dxmt/dxmt_presenter.cpp b/src/dxmt/dxmt_presenter.cpp index 1bd4fcef..f1da2b0f 100644 --- a/src/dxmt/dxmt_presenter.cpp +++ b/src/dxmt/dxmt_presenter.cpp @@ -115,7 +115,9 @@ Presenter::synchronizeLayerProperties() { WMT::MetalDrawable Presenter::encodeCommands( - WMT::CommandBuffer cmdbuf, WMT::Fence fence, WMT::Texture backbuffer, DXMTPresentMetadata metadata + WMT::CommandBuffer cmdbuf, WMT::Texture backbuffer, DXMTPresentMetadata metadata, + std::function &&wait_fences, + std::function &&update_fences ) { auto drawable = layer_.nextDrawable(); @@ -125,8 +127,7 @@ Presenter::encodeCommands( info.colors[0].store_action = WMTStoreActionStore; info.colors[0].texture = drawable.texture(); auto encoder = cmdbuf.renderCommandEncoder(info); - if (fence) - encoder.waitForFence(fence, WMTRenderStageFragment); + wait_fences(encoder); encoder.setFragmentTexture(backbuffer, 0); double width = layer_props_.drawable_width; @@ -140,7 +141,7 @@ Presenter::encodeCommands( } encoder.setViewport({0, 0, width, height, 0, 1}); encoder.drawPrimitives(WMTPrimitiveTypeTriangle, 0, 3); - + update_fences(encoder); encoder.endEncoding(); return drawable; diff --git a/src/dxmt/dxmt_presenter.hpp b/src/dxmt/dxmt_presenter.hpp index 08171f56..de29249d 100644 --- a/src/dxmt/dxmt_presenter.hpp +++ b/src/dxmt/dxmt_presenter.hpp @@ -54,8 +54,11 @@ class Presenter : public RcObject { PresentState synchronizeLayerProperties(); - WMT::MetalDrawable - encodeCommands(WMT::CommandBuffer cmdbuf, WMT::Fence fence, WMT::Texture backbuffer, DXMTPresentMetadata metadata); + WMT::MetalDrawable encodeCommands( + WMT::CommandBuffer cmdbuf, WMT::Texture backbuffer, DXMTPresentMetadata metadata, + std::function &&wait_fences, + std::function &&update_fences + ); private: void buildRenderPipelineState(bool is_pq, bool with_hdr_metadata, bool is_ms); diff --git a/src/dxmt/dxmt_texture.cpp b/src/dxmt/dxmt_texture.cpp index bf79e19c..d508a783 100644 --- a/src/dxmt/dxmt_texture.cpp +++ b/src/dxmt/dxmt_texture.cpp @@ -50,7 +50,6 @@ TextureAllocation::TextureAllocation( gpuResourceID = info_copy.gpu_resource_id; machPort = 0; - depkey = EncoderDepSet::generateNewKey(global_texture_seq.fetch_add(1)); }; TextureAllocation::TextureAllocation( @@ -63,7 +62,6 @@ TextureAllocation::TextureAllocation( mappedMemory = nullptr; gpuResourceID = textureDescriptor.gpu_resource_id; machPort = textureDescriptor.mach_port; - depkey = EncoderDepSet::generateNewKey(global_texture_seq.fetch_add(1)); }; TextureAllocation::~TextureAllocation(){ @@ -159,12 +157,9 @@ Texture::Texture( Rc Texture::allocate(Flags flags) { - WMTResourceOptions options = WMTResourceStorageModeShared; + WMTResourceOptions options = WMTResourceHazardTrackingModeUntracked; WMTTextureInfo info = info_; // copy info.mach_port = 0; - if (flags.test(TextureAllocationFlag::GpuReadonly)) { - options |= WMTResourceHazardTrackingModeUntracked; - } if (flags.test(TextureAllocationFlag::CpuWriteCombined)) { options |= WMTResourceOptionCPUCacheModeWriteCombined; } diff --git a/src/dxmt/dxmt_texture.hpp b/src/dxmt/dxmt_texture.hpp index ccf28379..bfe3cbf5 100644 --- a/src/dxmt/dxmt_texture.hpp +++ b/src/dxmt/dxmt_texture.hpp @@ -98,7 +98,7 @@ class TextureAllocation : public Allocation { void *mappedMemory; uint64_t gpuResourceID; mach_port_t machPort; - EncoderDepKey depkey; + GenericAccessTracker fenceTracker; private: TextureAllocation( diff --git a/src/dxmt/meson.build b/src/dxmt/meson.build index 0eac0d64..6cb08d99 100644 --- a/src/dxmt/meson.build +++ b/src/dxmt/meson.build @@ -17,6 +17,7 @@ dxmt_src = [ 'dxmt_sampler.cpp', 'dxmt_resource_initializer.cpp', 'dxmt_shader_cache.cpp', + 'dxmt_deptrack.cpp', ] dxmt_shaders = [ diff --git a/src/winemetal/winemetal.h b/src/winemetal/winemetal.h index 767f84ca..0e777416 100644 --- a/src/winemetal/winemetal.h +++ b/src/winemetal/winemetal.h @@ -1113,6 +1113,7 @@ enum WMTRenderStages : uint8_t { WMTRenderStageTile = 4, WMTRenderStageObject = 8, WMTRenderStageMesh = 16, + WMTRenderStagePreRaster = WMTRenderStageVertex | WMTRenderStageObject | WMTRenderStageMesh, }; struct wmtcmd_render_useresource {