diff --git a/src/d3d11/d3d11_context_impl.cpp b/src/d3d11/d3d11_context_impl.cpp
index 490d6807..7042ecd6 100644
--- a/src/d3d11/d3d11_context_impl.cpp
+++ b/src/d3d11/d3d11_context_impl.cpp
@@ -1561,7 +1561,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
     if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
       EmitOP([IndexType, IndexBufferOffset, Primitive, ArgBuffer = bindable->buffer(),
             AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) {
-        auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
+        auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
         enc.bumpVisibilityResultOffset();
         auto [index_buffer, index_sub_offset] = enc.currentIndexBuffer();
         auto &cmd = enc.encodeRenderCommand<wmtcmd_render_draw_indexed_indirect>();
@@ -1596,7 +1596,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
     }
     if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
       EmitOP([Primitive, ArgBuffer = bindable->buffer(), AlignedByteOffsetForArgs](ArgumentEncodingContext &enc) {
-        auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
+        auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
         enc.bumpVisibilityResultOffset();
         auto &cmd = enc.encodeRenderCommand<wmtcmd_render_draw_indirect>();
         cmd.type = WMTRenderCommandDrawIndirect;
@@ -1614,7 +1614,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
     auto max_object_threadgroups = max_object_threadgroups_;
     if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
       EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) {
-        auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
+        auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
         auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4);
   
         auto [vertex_per_warp, vertex_increment_per_wrap] = get_gs_vertex_count(topo);
@@ -1645,7 +1645,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
 
     if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
       EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) {
-        auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
+        auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
         auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4);
   
         auto [vertex_per_warp, vertex_increment_per_wrap] = get_gs_vertex_count(topo);
@@ -1677,7 +1677,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
     auto max_object_threadgroups = max_object_threadgroups_;
     if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
       EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) {
-        auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
+        auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
         auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4);
   
         auto PatchPerGroup = 32 / enc.tess_threads_per_patch;
@@ -1711,7 +1711,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
 
     if (auto bindable = reinterpret_cast<D3D11ResourceCommon *>(pBufferForArgs)) {
       EmitOP([=, topo = state_.InputAssembler.Topology, ArgBuffer = bindable->buffer()](ArgumentEncodingContext &enc) {
-        auto [buffer, buffer_offset] = enc.access(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
+        auto [buffer, buffer_offset] = enc.access<true>(ArgBuffer, AlignedByteOffsetForArgs, 20, DXMT_ENCODER_RESOURCE_ACESS_READ);
         auto dispatch_arg = enc.allocateTempBuffer1(sizeof(DXMT_DISPATCH_ARGUMENTS), 4);
   
         auto PatchPerGroup = 32 / enc.tess_threads_per_patch;
@@ -4668,7 +4668,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
       auto &so_slot0 = state_.StreamOutput.Targets[0];
       if (so_slot0.Offset == 0xFFFFFFFF) {
         EmitST([slot0 = so_slot0.Buffer->buffer()](ArgumentEncodingContext &enc) {
-          auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE);
+          auto [buffer, buffer_offset] = enc.access<true>(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE);
           auto &cmd = enc.encodeRenderCommand<wmtcmd_render_setbuffer>();
           cmd.type = WMTRenderCommandSetVertexBuffer;
           cmd.buffer = buffer->buffer();;
@@ -4679,7 +4679,7 @@ template <typename ContextInternalState> class MTLD3D11DeviceContextImplBase : p
         });
       } else {
         EmitST([slot0 = so_slot0.Buffer->buffer(), offset = so_slot0.Offset](ArgumentEncodingContext &enc) {
-          auto [buffer, buffer_offset] = enc.access(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE);
+          auto [buffer, buffer_offset] = enc.access<true>(slot0, 0, slot0->length(), DXMT_ENCODER_RESOURCE_ACESS_WRITE);
           auto &cmd = enc.encodeRenderCommand<wmtcmd_render_setbuffer>();
           cmd.type = WMTRenderCommandSetVertexBuffer;
           cmd.buffer = buffer->buffer();;
diff --git a/src/dxmt/dxmt_buffer.cpp b/src/dxmt/dxmt_buffer.cpp
index b0cd533b..52638a4c 100644
--- a/src/dxmt/dxmt_buffer.cpp
+++ b/src/dxmt/dxmt_buffer.cpp
@@ -22,6 +22,10 @@ BufferAllocation::BufferAllocation(WMT::Device device, const WMTBufferInfo &info
     suballocation_count_ = DXMT_PAGE_SIZE / suballocation_size_;
     info_.length = DXMT_PAGE_SIZE;
   }
+  fenceTrackers.reserve(suballocation_count_);
+  for (auto i = 0u; i < suballocation_count_; i++) {
+    fenceTrackers.push_back({});
+  }
   if (flags_.test(BufferAllocationFlag::CpuPlaced)) {
     placed_buffer = wsi::aligned_malloc(info_.length, DXMT_PAGE_SIZE);
     info_.memory.set(placed_buffer);
@@ -29,7 +33,6 @@ BufferAllocation::BufferAllocation(WMT::Device device, const WMTBufferInfo &info
   obj_ = device.newBuffer(info_);
   gpuAddress_ = info_.gpu_address;
   mappedMemory_ = info_.memory.get_accessible_or_null();
-  depkey = EncoderDepSet::generateNewKey(global_buffer_seq.fetch_add(1));
 };
 
 BufferAllocation::~BufferAllocation() {
@@ -130,10 +133,7 @@ Buffer::createView(BufferViewDescriptor const &descriptor) {
 
 Rc<BufferAllocation>
 Buffer::allocate(Flags<BufferAllocationFlag> flags) {
-  WMTResourceOptions options = WMTResourceStorageModeShared;
-  if (flags.test(BufferAllocationFlag::GpuReadonly)) {
-    options |= WMTResourceHazardTrackingModeUntracked;
-  }
+  WMTResourceOptions options = WMTResourceHazardTrackingModeUntracked;
   if (flags.test(BufferAllocationFlag::CpuWriteCombined)) {
     options |= WMTResourceOptionCPUCacheModeWriteCombined;
   }
diff --git a/src/dxmt/dxmt_buffer.hpp b/src/dxmt/dxmt_buffer.hpp
index 63f1f3b3..178d3790 100644
--- a/src/dxmt/dxmt_buffer.hpp
+++ b/src/dxmt/dxmt_buffer.hpp
@@ -88,6 +88,10 @@ class BufferAllocation : public Allocation {
     return current_suballocation_ * stride;
   }
 
+  uint32_t currentSuballocation() const noexcept { // read-only: unscaled index held in current_suballocation_
+    return current_suballocation_;
+  }
+
   void
   updateContents(uint64_t offset, const void *data, uint64_t length, uint32_t suballocation = 0) noexcept {
     if (likely(mappedMemory_ != nullptr && !flags_.test(BufferAllocationFlag::GpuManaged))) {
@@ -98,7 +102,7 @@ class BufferAllocation : public Allocation {
   }
 
   DXMT_RESOURCE_RESIDENCY_STATE residencyState;
-  EncoderDepKey depkey;
+  std::vector<GenericAccessTracker> fenceTrackers;
 
 private:
   BufferAllocation(WMT::Device device, const WMTBufferInfo &info, Flags<BufferAllocationFlag> flags);
diff --git a/src/dxmt/dxmt_context.cpp b/src/dxmt/dxmt_context.cpp
index b8608fde..ea4cca03 100644
--- a/src/dxmt/dxmt_context.cpp
+++ b/src/dxmt/dxmt_context.cpp
@@ -1,6 +1,7 @@
 #include "dxmt_context.hpp"
 #include "Metal.hpp"
 #include "dxmt_command_queue.hpp"
+#include "dxmt_deptrack.hpp"
 #include "dxmt_format.hpp"
 #include "dxmt_occlusion_query.hpp"
 #include "dxmt_presenter.hpp"
@@ -41,6 +42,10 @@ ArgumentEncodingContext::ArgumentEncodingContext(CommandQueue &queue, WMT::Devic
   dummy_cbuffer_ = device.newBuffer(dummy_cbuffer_info_);
   std::memset(dummy_cbuffer_info_.memory.get(), 0, 65536);
   cpu_buffer_chunks_.emplace_back();
+
+  for (unsigned i = 0; i < kParityLane; i++) {
+    fence_pool_[i] = device.newFence();
+  }
 };
 
 ArgumentEncodingContext::~ArgumentEncodingContext() {
@@ -75,7 +80,7 @@ ArgumentEncodingContext::encodeVertexBuffers(uint32_t slot_mask, uint64_t offset
       continue;
     }
     auto valid_length = buffer->length() > state.offset ? buffer->length() - state.offset : 0;
-    auto [buffer_alloc, buffer_offset] = access(buffer, state.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ);
+    auto [buffer_alloc, buffer_offset] = access<true>(buffer, state.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ);
     entries[index].buffer_handle = buffer_alloc->gpuAddress() + buffer_offset + state.offset;
     entries[index].stride = state.stride;
     entries[index++].length = valid_length;
@@ -139,6 +144,9 @@ void
 ArgumentEncodingContext::encodeConstantBuffers(const MTL_SHADER_REFLECTION *reflection, const MTL_SM50_SHADER_ARGUMENT * constant_buffers, uint64_t offset) {
   uint64_t *encoded_buffer = getMappedArgumentBuffer<uint64_t, stage == PipelineStage::Compute>(offset);
 
+  constexpr bool PreRasterStage = stage == PipelineStage::Vertex || stage == PipelineStage::Domain ||
+                                 stage == PipelineStage::Hull || stage == PipelineStage::Geometry;
+
   for (unsigned i = 0; i < reflection->NumConstantBuffers; i++) {
     auto &arg = constant_buffers[i];
     auto slot = 14 * unsigned(stage) + arg.SM50BindingSlot;
@@ -152,7 +160,7 @@ ArgumentEncodingContext::encodeConstantBuffers(const MTL_SHADER_REFLECTION *refl
       }
       auto argbuf = cbuf.buffer;
       auto valid_length = argbuf->length() > cbuf.offset ? argbuf->length() - cbuf.offset : 0;
-      auto [argbuf_alloc, argbuf_offset] = access(argbuf, cbuf.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ);
+      auto [argbuf_alloc, argbuf_offset] = access<PreRasterStage>(argbuf, cbuf.offset, valid_length, DXMT_ENCODER_RESOURCE_ACESS_READ);
       encoded_buffer[arg.StructurePtrOffset] = argbuf_alloc->gpuAddress() + argbuf_offset + cbuf.offset;
       makeResident<stage, kind>(argbuf.ptr());
       break;
@@ -240,6 +248,9 @@ ArgumentEncodingContext::encodeShaderResources(
 
   auto &UAVBindingSet = stage == PipelineStage::Compute ? cs_uav_ : om_uav_;
 
+  constexpr bool PreRasterStage = stage == PipelineStage::Vertex || stage == PipelineStage::Domain ||
+                                 stage == PipelineStage::Hull || stage == PipelineStage::Geometry;
+
   for (unsigned i = 0; i < BindingCount; i++) {
     auto &arg = arguments[i];
     switch (arg.Type) {
@@ -266,7 +277,7 @@ ArgumentEncodingContext::encodeShaderResources(
 
       if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_BUFFER) {
         if (srv.buffer.ptr()) {
-          auto [srv_alloc, offset] = access(srv.buffer, srv.slice.byteOffset, srv.slice.byteLength, DXMT_ENCODER_RESOURCE_ACESS_READ);
+          auto [srv_alloc, offset] = access<PreRasterStage>(srv.buffer, srv.slice.byteOffset, srv.slice.byteLength, DXMT_ENCODER_RESOURCE_ACESS_READ);
           encoded_buffer[arg.StructurePtrOffset] = srv_alloc->gpuAddress() + offset + srv.slice.byteOffset;
           encoded_buffer[arg.StructurePtrOffset + 1] = srv.slice.byteLength;
           makeResident<stage, kind>(srv.buffer.ptr());
@@ -277,7 +288,7 @@ ArgumentEncodingContext::encodeShaderResources(
       } else if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE) {
         if (srv.buffer.ptr()) {
           assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TBUFFER_OFFSET);
-          auto [view, offset] = access(srv.buffer, srv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ);
+          auto [view, offset] = access<PreRasterStage>(srv.buffer, srv.viewId, DXMT_ENCODER_RESOURCE_ACESS_READ);
           encoded_buffer[arg.StructurePtrOffset] = view.gpu_resource_id;
           encoded_buffer[arg.StructurePtrOffset + 1] =
               ((uint64_t)srv.slice.elementCount << 32) | (uint64_t)(srv.slice.firstElement + offset);
@@ -286,7 +297,7 @@ ArgumentEncodingContext::encodeShaderResources(
           assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_MINLOD_CLAMP);
           auto viewIdChecked = srv.texture->checkViewUseArray(srv.viewId, arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_ARRAY);
           encoded_buffer[arg.StructurePtrOffset] =
-              access(srv.texture, viewIdChecked, DXMT_ENCODER_RESOURCE_ACESS_READ).gpuResourceID;
+              access<PreRasterStage>(srv.texture, viewIdChecked, DXMT_ENCODER_RESOURCE_ACESS_READ).gpuResourceID;
           encoded_buffer[arg.StructurePtrOffset + 1] = TextureMetadata(srv.texture->arrayLength(viewIdChecked), 0);
           makeResident<stage, kind>(srv.texture.ptr(), viewIdChecked);
         } else {
@@ -306,7 +317,7 @@ ArgumentEncodingContext::encodeShaderResources(
 
       if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_BUFFER) {
         if (uav.buffer.ptr()) {
-          auto [uav_alloc, offset] = access(uav.buffer, uav.slice.byteOffset, uav.slice.byteLength, access_flags);
+          auto [uav_alloc, offset] = access<PreRasterStage>(uav.buffer, uav.slice.byteOffset, uav.slice.byteLength, access_flags);
           encoded_buffer[arg.StructurePtrOffset] = uav_alloc->gpuAddress() + offset + uav.slice.byteOffset;
           encoded_buffer[arg.StructurePtrOffset + 1] = uav.slice.byteLength;
           makeResident<stage, kind>(uav.buffer.ptr(), read, write);
@@ -317,7 +328,7 @@ ArgumentEncodingContext::encodeShaderResources(
       } else if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE) {
         if (uav.buffer.ptr()) {
           assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TBUFFER_OFFSET);
-          auto [view, offset] = access(uav.buffer, uav.viewId, access_flags);
+          auto [view, offset] = access<PreRasterStage>(uav.buffer, uav.viewId, access_flags);
           encoded_buffer[arg.StructurePtrOffset] = view.gpu_resource_id;
           encoded_buffer[arg.StructurePtrOffset + 1] =
               ((uint64_t)uav.slice.elementCount << 32) | (uint64_t)(uav.slice.firstElement + offset);
@@ -325,7 +336,7 @@ ArgumentEncodingContext::encodeShaderResources(
         } else if (uav.texture.ptr()) {
           assert(arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_MINLOD_CLAMP);
           auto viewIdChecked = uav.texture->checkViewUseArray(uav.viewId, arg.Flags & MTL_SM50_SHADER_ARGUMENT_TEXTURE_ARRAY);
-          encoded_buffer[arg.StructurePtrOffset] = access(uav.texture, viewIdChecked, access_flags).gpuResourceID;
+          encoded_buffer[arg.StructurePtrOffset] = access<PreRasterStage>(uav.texture, viewIdChecked, access_flags).gpuResourceID;
           encoded_buffer[arg.StructurePtrOffset + 1] = TextureMetadata(uav.texture->arrayLength(viewIdChecked), 0);
           makeResident<stage, kind>(uav.texture.ptr(), viewIdChecked, read, write);
         } else {
@@ -335,7 +346,7 @@ ArgumentEncodingContext::encodeShaderResources(
       }
       if (arg.Flags & MTL_SM50_SHADER_ARGUMENT_UAV_COUNTER) {
         if (uav.counter) {
-          auto [counter_alloc, offset] = access(uav.counter, 0, 4, DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE);
+          auto [counter_alloc, offset] = access<PreRasterStage>(uav.counter, 0, 4, DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE);
           encoded_buffer[arg.StructurePtrOffset + 2] = counter_alloc->gpuAddress() + offset;
           makeResident<stage, kind>(uav.counter.ptr(), true, true);
         } else {
@@ -395,6 +406,8 @@ ArgumentEncodingContext::clearColor(Rc<Texture> &&texture, unsigned viewId, unsi
   auto encoder_info = allocate<ClearEncoderData>();
   encoder_info->type = EncoderType::Clear;
   encoder_info->id = nextEncoderId();
+  encoder_info->fence_wait = {};
+  encoder_info->fence_update = {encoder_info->id};
   encoder_info->clear_dsv = 0;
   encoder_info->color = color;
   encoder_info->array_length = arrayLength;
@@ -417,6 +430,8 @@ ArgumentEncodingContext::clearDepthStencil(
   auto encoder_info = allocate<ClearEncoderData>();
   encoder_info->type = EncoderType::Clear;
   encoder_info->id = nextEncoderId();
+  encoder_info->fence_wait = {};
+  encoder_info->fence_update = {encoder_info->id};
   encoder_info->clear_dsv = flag & DepthStencilPlanarFlags(texture->pixelFormat());
   encoder_info->depth_stencil = {depth, stencil};
   encoder_info->array_length = arrayLength;
@@ -438,6 +453,9 @@ ArgumentEncodingContext::resolveTexture(
   assert(!encoder_current);
   auto encoder_info = allocate<ResolveEncoderData>();
   encoder_info->type = EncoderType::Resolve;
+  encoder_info->id = nextEncoderId();
+  encoder_info->fence_wait = {};
+  encoder_info->fence_update = {encoder_info->id};
   encoder_current = encoder_info;
 
   encoder_info->src = access(src, src_view, DXMT_ENCODER_RESOURCE_ACESS_READ);
@@ -452,14 +470,14 @@ ArgumentEncodingContext::present(Rc<Texture> &texture, Rc<Presenter> &presenter,
   auto encoder_info = allocate<PresentData>();
   encoder_info->type = EncoderType::Present;
   encoder_info->id = nextEncoderId();
-  encoder_info->backbuffer = texture->current()->texture();
+  encoder_info->fence_wait = {};
+  encoder_info->fence_update = {encoder_info->id};
   encoder_info->presenter = presenter;
   encoder_info->after = after;
   encoder_info->metadata = metadata;
 
-  encoder_info->tex_read.add(texture->current()->depkey);
-
   encoder_current = encoder_info;
+  encoder_info->backbuffer = access(texture, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture;
   endPass();
 }
 
@@ -469,14 +487,13 @@ ArgumentEncodingContext::upscale(Rc<Texture> &texture, Rc<Texture> &upscaled, WM
   auto encoder_info = allocate<SpatialUpscaleData>();
   encoder_info->type = EncoderType::SpatialUpscale;
   encoder_info->id = nextEncoderId();
-  encoder_info->backbuffer = texture->current()->texture();
-  encoder_info->upscaled = upscaled->current()->texture();
+  encoder_info->fence_wait = {};
+  encoder_info->fence_update = {encoder_info->id};
   encoder_info->scaler = scaler;
 
-  encoder_info->tex_read.add(texture->current()->depkey);
-  encoder_info->tex_write.add(upscaled->current()->depkey);
-
   encoder_current = encoder_info;
+  encoder_info->backbuffer = access(texture, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture;
+  encoder_info->upscaled = access(upscaled, 0, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture;
   endPass();
 }
 
@@ -489,25 +506,19 @@ ArgumentEncodingContext::upscaleTemporal(
   auto encoder_info = allocate<TemporalUpscaleData>();
   encoder_info->type = EncoderType::TemporalUpscale;
   encoder_info->id = nextEncoderId();
-  encoder_info->input = input->current()->texture();
-  encoder_info->output = output->current()->texture();
-  encoder_info->depth = depth->current()->texture();
-  encoder_info->motion_vector = motion_vector->view(mvViewId).texture;
+  encoder_info->fence_wait = {};
+  encoder_info->fence_update = {encoder_info->id};
   encoder_info->scaler = scaler;
   encoder_info->props = props;
 
-  encoder_info->tex_read.add(input->current()->depkey);
-  encoder_info->tex_read.add(depth->current()->depkey);
-  encoder_info->tex_read.add(motion_vector->current()->depkey);
-  encoder_info->tex_write.add(output->current()->depkey);
-  if(exposure) {
-    encoder_info->exposure = exposure->current()->texture();
-    encoder_info->tex_read.add(exposure->current()->depkey);
-  } else {
-    encoder_info->exposure = nullptr;
-  }
-
   encoder_current = encoder_info;
+  encoder_info->input = access(input, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture;
+  encoder_info->depth = access(depth, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture;
+  encoder_info->motion_vector = access(motion_vector, mvViewId, DXMT_ENCODER_RESOURCE_ACESS_READ).texture;
+  encoder_info->output = access(output, 0, DXMT_ENCODER_RESOURCE_ACESS_WRITE).texture;
+  if (exposure) {
+    encoder_info->exposure = access(exposure, 0, DXMT_ENCODER_RESOURCE_ACESS_READ).texture;
+  }
   endPass();
 }
 
@@ -516,7 +527,7 @@ ArgumentEncodingContext::signalEvent(uint64_t value) {
   assert(!encoder_current);
   auto encoder_info = allocate<SignalEventData>();
   encoder_info->type = EncoderType::SignalEvent;
-  encoder_info->id = nextEncoderId();
+  encoder_info->id = ~0ull;
   encoder_info->event = queue_.event;
   encoder_info->value = value;
 
@@ -529,7 +540,7 @@ ArgumentEncodingContext::signalEvent(WMT::Reference<WMT::Event> &&event, uint64_
   assert(!encoder_current);
   auto encoder_info = allocate<SignalEventData>();
   encoder_info->type = EncoderType::SignalEvent;
-  encoder_info->id = nextEncoderId();
+  encoder_info->id = ~0ull;
   encoder_info->event = std::move(event);
   encoder_info->value = value;
 
@@ -542,7 +553,7 @@ ArgumentEncodingContext::waitEvent(WMT::Reference<WMT::Event> &&event, uint64_t
   assert(!encoder_current);
   auto encoder_info = allocate<WaitForEventData>();
   encoder_info->type = EncoderType::WaitForEvent;
-  encoder_info->id = nextEncoderId();
+  encoder_info->id = ~0ull;
   encoder_info->event = std::move(event);
   encoder_info->value = value;
 
@@ -557,7 +568,12 @@ ArgumentEncodingContext::startRenderPass(
   assert(!encoder_current);
   auto encoder_info = allocate<RenderEncoderData>();
   encoder_info->type = EncoderType::Render;
+  encoder_info->encoder_id_vertex = nextEncoderId();
+  encoder_info->fence_wait_vertex = {};
+  encoder_info->fence_update_vertex = {encoder_info->encoder_id_vertex};
   encoder_info->id = nextEncoderId();
+  encoder_info->fence_wait = {};
+  encoder_info->fence_update = {encoder_info->id};
   encoder_info->cmd_head.type = WMTRenderCommandNop;
   encoder_info->cmd_head.next.set(0);
   encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head;
@@ -583,6 +599,8 @@ ArgumentEncodingContext::startComputePass(uint64_t encoder_argbuf_size) {
   auto encoder_info = allocate<ComputeEncoderData>();
   encoder_info->type = EncoderType::Compute;
   encoder_info->id = nextEncoderId();
+  encoder_info->fence_wait = {};
+  encoder_info->fence_update = {encoder_info->id};
   encoder_info->cmd_head.type = WMTComputeCommandNop;
   encoder_info->cmd_head.next.set(0);
   encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head;
@@ -603,6 +621,8 @@ ArgumentEncodingContext::startBlitPass() {
   auto encoder_info = allocate<BlitEncoderData>();
   encoder_info->type = EncoderType::Blit;
   encoder_info->id = nextEncoderId();
+  encoder_info->fence_wait = {};
+  encoder_info->fence_update = {encoder_info->id};
   encoder_info->cmd_head.type = WMTBlitCommandNop;
   encoder_info->cmd_head.next.set(0);
   encoder_info->cmd_tail = (wmtcmd_base *)&encoder_info->cmd_head;
@@ -622,6 +642,19 @@ ArgumentEncodingContext::endPass() {
   if (encoder_current->type == EncoderType::Render)
     vro_state_.endEncoder();
 
+  if (encoder_current->id != ~0ull) {
+    if (encoder_current->type == EncoderType::Render) {
+      auto render_encoder = static_cast<RenderEncoderData *>(encoder_current);
+      render_encoder->fence_wait_vertex =
+          fence_locality_.collectAndSimplifyWaits(render_encoder->fence_wait_vertex, render_encoder->encoder_id_vertex);
+      encoder_current->fence_wait =
+          fence_locality_.collectAndSimplifyWaits(encoder_current->fence_wait, encoder_last->id, true);
+    } else {
+      encoder_current->fence_wait =
+          fence_locality_.collectAndSimplifyWaits(encoder_current->fence_wait, encoder_last->id);
+    }
+  }
+
   encoder_current = nullptr;
   encoder_count_++;
 }
@@ -705,9 +738,10 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
   if (encoder_count > 1) {
     unsigned j, i;
     for (j = encoder_count - 2; j != ~0u; j--) {
-      if (encoders[j]->type == EncoderType::Null)
+      // TODO(fences): we don't actively move encoders other than clear and render
+      if (encoders[j]->type != EncoderType::Clear && encoders[j]->type != EncoderType::Render)
         continue;
-      for (i = j + 1; i < std::min(encoder_count, j + kEncoderOptimizerThreshold); i++) {
+      for (i = j + 1; i < encoder_count; i++) {
         if (encoders[i]->type == EncoderType::Null)
           continue;
         if (checkEncoderRelation(encoders[j], encoders[i]) == DXMT_ENCODER_LIST_OP_SYNCHRONIZE)
@@ -783,6 +817,11 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
       }
       auto gpu_buffer_ = data->allocated_argbuf;
       auto encoder = cmdbuf.renderCommandEncoder(render_pass_info);
+      data->fence_wait.forEach(
+          data->fence_wait_vertex, // if a fence is waited pre-raster, no need to wait again at fragment
+          [&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStagePreRaster); },
+          [&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); }
+      );
       encoder.setVertexBuffer(gpu_buffer_, 0, 16);
       encoder.setVertexBuffer(gpu_buffer_, 0, 29);
       encoder.setVertexBuffer(gpu_buffer_, 0, 30);
@@ -859,6 +898,11 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
         );
       }
       encoder.encodeCommands(&data->cmd_head);
+      data->fence_update_vertex.forEach(
+          data->fence_update, // if a fence is updated at fragment, no need to update again pre-raster
+          [&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); },
+          [&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStagePreRaster); }
+      );
       encoder.endEncoding();
       data->~RenderEncoderData();
       break;
@@ -866,6 +910,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
     case EncoderType::Compute: {
       auto data = static_cast<ComputeEncoderData *>(current);
       auto encoder = cmdbuf.computeCommandEncoder(false);
+      data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id]); });
       struct wmtcmd_compute_setbuffer setcmd;
       setcmd.type = WMTComputeCommandSetBuffer;
       setcmd.next.set(nullptr);
@@ -876,6 +921,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
       setcmd.index = 30;
       encoder.encodeCommands((const wmtcmd_compute_nop *)&setcmd);
       encoder.encodeCommands(&data->cmd_head);
+      data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id]); });
       encoder.endEncoding();
       data->~ComputeEncoderData();
       break;
@@ -883,7 +929,9 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
     case EncoderType::Blit: {
       auto data = static_cast<BlitEncoderData *>(current);
       auto encoder = cmdbuf.blitCommandEncoder();
+      data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id]); });
       encoder.encodeCommands(&data->cmd_head);
+      data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id]); });
       encoder.endEncoding();
       data->~BlitEncoderData();
       break;
@@ -891,7 +939,15 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
     case EncoderType::Present: {
       auto data = static_cast<PresentData *>(current);
       auto t0 = clock::now();
-      auto drawable = data->presenter->encodeCommands(cmdbuf, {}, data->backbuffer, data->metadata);
+      auto drawable = data->presenter->encodeCommands(
+          cmdbuf, data->backbuffer, data->metadata,
+          [&](WMT::RenderCommandEncoder encoder) {
+            data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); });
+          },
+          [&](WMT::RenderCommandEncoder encoder) {
+            data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); });
+          }
+      );
       auto t1 = clock::now();
       currentFrameStatistics().drawable_blocking_interval += (t1 - t0);
       if (data->after > 0)
@@ -930,6 +986,8 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
         info.render_target_array_length = data->array_length;
         auto encoder = cmdbuf.renderCommandEncoder(info);
         encoder.setLabel(WMT::String::string("ClearPass", WMTUTF8StringEncoding));
+        data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); });
+        data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); });
         encoder.endEncoding();
       }
       data->~ClearEncoderData();
@@ -947,6 +1005,8 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
 
         auto encoder = cmdbuf.renderCommandEncoder(info);
         encoder.setLabel(WMT::String::string("ResolvePass", WMTUTF8StringEncoding));
+        data->fence_wait.forEach([&](auto id) { encoder.waitForFence(fence_pool_[id], WMTRenderStageFragment); });
+        data->fence_update.forEach([&](auto id) { encoder.updateFence(fence_pool_[id], WMTRenderStageFragment); });
         encoder.endEncoding();
       }
       data->~ResolveEncoderData();
@@ -954,7 +1014,15 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
     }
     case EncoderType::SpatialUpscale: {
       auto data = static_cast<SpatialUpscaleData *>(current);
-      cmdbuf.encodeSpatialScale(data->scaler, data->backbuffer, data->upscaled, {});
+      auto fence_muxer = cmdbuf.blitCommandEncoder();
+      fence_muxer.setLabel(WMT::String::string("FenceMultiplexer", WMTUTF8StringEncoding));
+      data->fence_wait.forEach([&](auto id) { fence_muxer.waitForFence(fence_pool_[id]); });
+      data->fence_update.forEach([&](auto id) { fence_muxer.updateFence(fence_pool_[id]); });
+      fence_muxer.endEncoding();
+      data->fence_update.forEach([&](auto id) {
+        // TODO(fences): this assumes fence_update holds exactly one fence; the call below is issued once per entry
+        cmdbuf.encodeSpatialScale(data->scaler, data->backbuffer, data->upscaled, fence_pool_[id]);
+      });
       data->~SpatialUpscaleData();
       break;
     }
@@ -972,7 +1040,18 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
     }
     case EncoderType::TemporalUpscale: {
       auto data = static_cast<TemporalUpscaleData *>(current);
-      cmdbuf.encodeTemporalScale(data->scaler, data->input, data->output, data->depth, data->motion_vector, data->exposure, {}, data->props);
+      auto fence_muxer = cmdbuf.blitCommandEncoder();
+      fence_muxer.setLabel(WMT::String::string("FenceMultiplexer", WMTUTF8StringEncoding));
+      data->fence_wait.forEach([&](auto id) { fence_muxer.waitForFence(fence_pool_[id]); });
+      data->fence_update.forEach([&](auto id) { fence_muxer.updateFence(fence_pool_[id]); });
+      fence_muxer.endEncoding();
+      data->fence_update.forEach([&](auto id) {
+        // TODO(fences): fence_update is expected to contain exactly one fence; the scale is encoded once per fence (never if empty)
+        cmdbuf.encodeTemporalScale(
+            data->scaler, data->input, data->output, data->depth, data->motion_vector, data->exposure, fence_pool_[id],
+            data->props
+        );
+      });
       data->~TemporalUpscaleData();
       break;
     }
@@ -998,6 +1077,7 @@ ArgumentEncodingContext::flushCommands(WMT::CommandBuffer cmdbuf, uint64_t seqId
 
 DXMT_ENCODER_LIST_OP
 ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *latter) {
+
   if (former->type == EncoderType::Null)
     return DXMT_ENCODER_LIST_OP_SWAP;
   if (latter->type == EncoderType::Null)
@@ -1025,7 +1105,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *
             depth_attachment->clear_depth = clear->depth_stencil.first;
             depth_attachment->load_action = WMTLoadActionClear;
             depth_attachment->store_action = WMTStoreActionStore;
-            render->tex_write.merge(clear->tex_write);
+            render->fence_update.merge(clear->fence_update);
+            render->fence_wait.merge(clear->fence_wait);
+            render->fence_wait.subtract(clear->fence_update);
           }
           clear->clear_dsv &= ~1;
         }
@@ -1034,7 +1116,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *
             stencil_attachment->clear_stencil = clear->depth_stencil.second;
             stencil_attachment->load_action = WMTLoadActionClear;
             stencil_attachment->store_action = WMTStoreActionStore;
-            render->tex_write.merge(clear->tex_write);
+            render->fence_update.merge(clear->fence_update);
+            render->fence_wait.merge(clear->fence_wait);
+            render->fence_wait.subtract(clear->fence_update);
           }
           clear->clear_dsv &= ~2;
         }
@@ -1050,8 +1134,11 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *
           if (attachment->load_action == WMTLoadActionLoad) {
             attachment->load_action = WMTLoadActionClear;
             attachment->clear_color = clear->color;
-            if (attachment->store_action != WMTStoreActionDontCare)
-              render->tex_write.merge(clear->tex_write);
+            if (attachment->store_action != WMTStoreActionDontCare) {
+              render->fence_update.merge(clear->fence_update);
+              render->fence_wait.merge(clear->fence_wait);
+              render->fence_wait.subtract(clear->fence_update);
+            }
           }
 
           currentFrameStatistics().clear_pass_optimized++;
@@ -1081,7 +1168,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *
       if (result.src) {
         result.src->store_action = WMTStoreActionStoreAndMultisampleResolve;
         result.src->resolve_attachment = result.dst;
-        render->tex_write.merge(resolve->tex_write);
+        render->fence_update.merge(resolve->fence_update);
+        render->fence_wait.merge(resolve->fence_wait);
+        render->fence_wait.subtract(resolve->fence_update);
 
         currentFrameStatistics().resolve_pass_optimized++;
         resolve->~ResolveEncoderData();
@@ -1097,7 +1186,9 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *
     auto r1 = reinterpret_cast<RenderEncoderData *>(latter);
     auto r0 = reinterpret_cast<RenderEncoderData *>(former);
 
-    if (isEncoderSignatureMatched(r0, r1)) {
+    if (isEncoderSignatureMatched(r0, r1) &&
+        // can't merge if the latter's vertex stage waits on a fence updated by the former's fragment stage
+        !r1->fence_wait_vertex.intersectedWith(r0->fence_update)) {
       for (unsigned i = 0; i < r0->render_target_count; i++) {
         auto &a0 = r0->colors[i];
         auto &a1 = r1->colors[i];
@@ -1134,10 +1225,19 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *
       r1->ts_arg_marshal_tasks = std::move(r0->ts_arg_marshal_tasks);
       r1->use_visibility_result = r0->use_visibility_result || r1->use_visibility_result;
 
-      r1->buf_read.merge(r0->buf_read);
-      r1->buf_write.merge(r0->buf_write);
-      r1->tex_read.merge(r0->tex_read);
-      r1->tex_write.merge(r0->tex_write);
+      r1->fence_update.merge(r0->fence_update);
+      r1->fence_wait.merge(r0->fence_wait);
+      r1->fence_wait.subtract(r0->fence_update);
+      r1->fence_update_vertex.merge(r0->fence_update_vertex);
+      r1->fence_wait_vertex.merge(r0->fence_wait_vertex);
+      r1->fence_wait_vertex.subtract(r0->fence_update_vertex);
+
+      // just in case
+      r1->fence_wait.subtract(r0->fence_update_vertex);
+      /* 
+      r1->fence_wait_vertex.subtract(r0->fence_update);
+      does not make sense
+      */
 
       currentFrameStatistics().render_pass_optimized++;
       r0->~RenderEncoderData();
@@ -1153,26 +1253,26 @@ ArgumentEncodingContext::checkEncoderRelation(EncoderData *former, EncoderData *
 
 bool
 ArgumentEncodingContext::hasDataDependency(EncoderData *latter, EncoderData *former) {
-  if (latter->type == EncoderType::Clear && former->type == EncoderType::Clear) {
-    // FIXME: prove it's safe to return false
-    return false;
+  if (former->type == EncoderType::Render) {
+    auto r0 = reinterpret_cast<RenderEncoderData *>(former);
+    FenceSet fence_wait_r0 = r0->fence_wait.unionOf(r0->fence_wait_vertex);
+    FenceSet fence_update_r0 = r0->fence_update_vertex.unionOf(r0->fence_update);
+    if (latter->type == EncoderType::Render) {
+      auto r1 = reinterpret_cast<RenderEncoderData *>(latter);
+      FenceSet fence_wait_r1 = r1->fence_wait.unionOf(r1->fence_wait_vertex);
+      FenceSet fence_update_r1 = r1->fence_update_vertex.unionOf(r1->fence_update);
+      return fence_update_r0.intersectedWith(fence_wait_r1) || fence_update_r1.intersectedWith(fence_wait_r0);
+    }
+    return fence_update_r0.intersectedWith(latter->fence_wait) || latter->fence_update.intersectedWith(fence_wait_r0);
+  }
+  if (latter->type == EncoderType::Render) {
+    auto r1 = reinterpret_cast<RenderEncoderData *>(latter);
+    FenceSet fence_wait = r1->fence_wait.unionOf(r1->fence_wait_vertex);
+    FenceSet fence_update = r1->fence_update_vertex.unionOf(r1->fence_update);
+    return former->fence_update.intersectedWith(fence_wait) || fence_update.intersectedWith(former->fence_wait);
   }
-  // read-after-write
-  if (!former->buf_write.isDisjointWith(latter->buf_read))
-    return true;
-  if (!former->tex_write.isDisjointWith(latter->tex_read))
-    return true;
-  // write-after-write
-  if (!former->buf_write.isDisjointWith(latter->buf_write))
-    return true;
-  if (!former->tex_write.isDisjointWith(latter->tex_write))
-    return true;
-  // write-after-read
-  if (!former->buf_read.isDisjointWith(latter->buf_write))
-    return true;
-  if (!former->tex_read.isDisjointWith(latter->tex_write))
-    return true;
-  return false;
+  return former->fence_update.intersectedWith(latter->fence_wait) ||
+         latter->fence_update.intersectedWith(former->fence_wait);
 }
 
 bool
diff --git a/src/dxmt/dxmt_context.hpp b/src/dxmt/dxmt_context.hpp
index 63f35e0e..e5ad1718 100644
--- a/src/dxmt/dxmt_context.hpp
+++ b/src/dxmt/dxmt_context.hpp
@@ -93,11 +93,9 @@ enum class EncoderType {
 struct EncoderData {
   EncoderType type;
   EncoderData *next = nullptr;
-  uint64_t id;
-  EncoderDepSet buf_read;
-  EncoderDepSet buf_write;
-  EncoderDepSet tex_read;
-  EncoderDepSet tex_write;
+  EncoderId id;
+  FenceSet fence_wait;
+  FenceSet fence_update;
 };
 
 struct GSDispatchArgumentsMarshal {
@@ -167,6 +165,9 @@ struct RenderEncoderData : EncoderData {
   wmtcmd_base *cmd_tail;
   WMT::Buffer allocated_argbuf;
   uint64_t allocated_argbuf_offset;
+  uint64_t encoder_id_vertex;
+  FenceSet fence_wait_vertex;
+  FenceSet fence_update_vertex;
   void *allocated_argbuf_mapping;
   uint8_t dsv_planar_flags;
   uint8_t dsv_readonly_flags;
@@ -283,12 +284,6 @@ enum DXMT_ENCODER_LIST_OP {
 
 class CommandQueue;
 
-enum DXMT_ENCODER_RESOURCE_ACESS {
-  DXMT_ENCODER_RESOURCE_ACESS_READ = 1 <<0,
-  DXMT_ENCODER_RESOURCE_ACESS_WRITE = 1 << 1,
-  DXMT_ENCODER_RESOURCE_ACESS_READWRITE = DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE,
-};
-
 struct AllocatedTempBufferSlice {
   WMT::Buffer gpu_buffer;
   uint64_t offset;
@@ -296,55 +291,83 @@ struct AllocatedTempBufferSlice {
 };
 
 class ArgumentEncodingContext {
+  template <bool PreRasterStage>
   void
   trackBuffer(BufferAllocation *allocation, DXMT_ENCODER_RESOURCE_ACESS flags) {
     retainAllocation(allocation);
     if (allocation->flags().test(BufferAllocationFlag::GpuReadonly))
       return;
-    if (flags & DXMT_ENCODER_RESOURCE_ACESS_READ)
-      encoder_current->buf_read.add(allocation->depkey);
+    auto &tracker = allocation->fenceTrackers[allocation->currentSuballocation()];
+    if constexpr (PreRasterStage) {
+      auto current_encoder = currentRenderEncoder();
+      auto id = current_encoder->encoder_id_vertex;
+      if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE)
+        tracker.write(id, current_encoder->fence_wait_vertex);
+      else
+        tracker.read(id, current_encoder->fence_wait_vertex);
+      return;
+    }
+    auto current_encoder = currentEncoder();
     if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE)
-      encoder_current->buf_write.add(allocation->depkey);
+      tracker.write(currentEncoderId(), current_encoder->fence_wait);
+    else
+      tracker.read(currentEncoderId(), current_encoder->fence_wait);
   }
 
+  template <bool PreRasterStage = false>
   void
   trackTexture(TextureAllocation *allocation, DXMT_ENCODER_RESOURCE_ACESS flags) {
     retainAllocation(allocation);
     if (allocation->flags().test(TextureAllocationFlag::GpuReadonly))
       return;
-    if (flags & DXMT_ENCODER_RESOURCE_ACESS_READ)
-      encoder_current->tex_read.add(allocation->depkey);
+    auto &tracker = allocation->fenceTracker;
+    if constexpr (PreRasterStage) {
+      auto current_encoder = currentRenderEncoder();
+      auto id = current_encoder->encoder_id_vertex;
+      if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE)
+        tracker.write(id, current_encoder->fence_wait_vertex);
+      else
+        tracker.read(id, current_encoder->fence_wait_vertex);
+      return;
+    }
+    auto current_encoder = currentEncoder();
     if (flags & DXMT_ENCODER_RESOURCE_ACESS_WRITE)
-      encoder_current->tex_write.add(allocation->depkey);
+      tracker.write(currentEncoderId(), current_encoder->fence_wait);
+    else
+      tracker.read(currentEncoderId(), current_encoder->fence_wait);
   }
 
 public:
+  template<bool PreRasterStage = false>
   std::pair<BufferAllocation *, uint64_t>
   access(Rc<Buffer> const &buffer, unsigned offset, unsigned length, DXMT_ENCODER_RESOURCE_ACESS flags) {
     auto allocation = buffer->current();
-    trackBuffer(allocation, flags);
+    trackBuffer<PreRasterStage>(allocation, flags);
     return {allocation, allocation->currentSuballocationOffset()};
   }
 
+  template<bool PreRasterStage = false>
   std::pair<BufferView const &, uint32_t>
   access(Rc<Buffer> const &buffer, unsigned viewId, DXMT_ENCODER_RESOURCE_ACESS flags) {
     auto allocation = buffer->current();
-    trackBuffer(allocation, flags);
+    trackBuffer<PreRasterStage>(allocation, flags);
     auto &view = buffer->view_(viewId, allocation);
     return {view, allocation->currentSuballocationOffset(view.suballocation_texel)};
   }
 
+  template<bool PreRasterStage = false>
   WMT::Texture
   access(Rc<Texture> const &texture, unsigned level, unsigned slice, DXMT_ENCODER_RESOURCE_ACESS flags) {
     auto allocation = texture->current();
-    trackTexture(allocation, flags);
+    trackTexture<PreRasterStage>(allocation, flags);
     return allocation->texture();
   }
 
+  template<bool PreRasterStage = false>
   TextureView &
   access(Rc<Texture> const &texture, unsigned viewId, DXMT_ENCODER_RESOURCE_ACESS flags) {
     auto allocation = texture->current();
-    trackTexture(allocation, flags);
+    trackTexture<PreRasterStage>(allocation, flags);
     return texture->view(viewId, allocation);
   }
 
@@ -425,7 +448,7 @@ class ArgumentEncodingContext {
   std::pair<WMT::Buffer, uint64_t>
   currentIndexBuffer() {
     // because of indirect draw, we can't predicate the accessed buffer range
-    auto [ibuf_alloc, offset] = access(ibuf_, 0, ibuf_->length(), DXMT_ENCODER_RESOURCE_ACESS_READ);
+    auto [ibuf_alloc, offset] = access<true>(ibuf_, 0, ibuf_->length(), DXMT_ENCODER_RESOURCE_ACESS_READ);
     return {ibuf_alloc->buffer(), offset};
   };
 
@@ -586,8 +609,7 @@ class ArgumentEncodingContext {
 
   uint64_t
   nextEncoderId() {
-    static std::atomic_uint64_t global_id = 0;
-    return global_id.fetch_add(1);
+    return encoder_id_++;
   };
 
   void clearColor(Rc<Texture> &&texture, unsigned viewId, unsigned arrayLength, WMTClearColor color);
@@ -610,9 +632,15 @@ class ArgumentEncodingContext {
     return encoder_current;
   }
 
+  constexpr uint64_t
+  currentEncoderId() {
+    assert(encoder_current != nullptr); // precondition: an encoder must be current
+    return encoder_current->id;
+  }
+
   constexpr RenderEncoderData *
   currentRenderEncoder() {
-    assert(encoder_current->type == EncoderType::Render);
+    assert(encoder_current && encoder_current->type == EncoderType::Render);
     return static_cast<RenderEncoderData *>(encoder_current);
   }
 
@@ -750,10 +778,14 @@ class ArgumentEncodingContext {
   void *dummy_cbuffer_host_;
   WMTBufferInfo dummy_cbuffer_info_;
 
-  EncoderData encoder_head = {EncoderType::Null, nullptr};
+  EncoderData encoder_head = {EncoderType::Null, nullptr, ~0ull};
   EncoderData *encoder_last = &encoder_head;
   EncoderData *encoder_current = nullptr;
   unsigned encoder_count_ = 0;
+  
+  uint64_t encoder_id_ = kParityLane; // actually important to not start from 0
+  std::array<WMT::Reference<WMT::Fence>, kParityLane> fence_pool_;
+  FenceLocalityCheck fence_locality_;
 
   uint64_t seq_id_;
   uint64_t frame_id_;
diff --git a/src/dxmt/dxmt_deptrack.cpp b/src/dxmt/dxmt_deptrack.cpp
new file mode 100644
index 00000000..8b3fe3e9
--- /dev/null
+++ b/src/dxmt/dxmt_deptrack.cpp
@@ -0,0 +1,77 @@
+#include "dxmt_deptrack.hpp"
+#include <cassert>
+
+namespace dxmt {
+
+void
+GenericAccessTracker::read(EncoderId id, FenceSet &wait_fences) {
+  if (!read_.add(id)) // `id` is already the newest recorded reader: nothing to add
+    return;
+  write_.enumerate(id, [&wait_fences](EncoderId writer) { wait_fences.set(writer); });
+}
+
+void
+GenericAccessTracker::write(EncoderId id, FenceSet &wait_fences) {
+  if (!write_.check(id)) // `id` is already the newest recorded writer: nothing to do
+    return;
+  write_.enumerate(id, [&wait_fences](EncoderId writer) { wait_fences.set(writer); });
+  read_.enumerate(id, [&wait_fences](EncoderId reader) { wait_fences.set(reader); });
+  read_.clear();
+  write_.clearAndAdd(id);
+}
+
+// Lookup table with one FenceSet per encoder slot (id % kParityLane), each filled by fillGenerationBefore().
+class WeakFenceMaskLTO {
+public:
+  constexpr WeakFenceMaskLTO() {
+    // Slot index is parity * kLane + lane, the same layout operator[] reads back.
+    for (int slot = 0; slot < kParityLane; ++slot) {
+      weak_fences_lto[slot].fillGenerationBefore(slot / kLane, slot % kLane);
+    }
+  }
+
+  // Returns the precomputed set for the slot that encoder id `i` maps to.
+  const FenceSet &
+  operator[](EncoderId i) const {
+    return weak_fences_lto[i % kParityLane];
+  }
+
+private:
+  FenceSet weak_fences_lto[kParityLane];
+};
+
+constexpr auto WEAK_FENCE_MASK = WeakFenceMaskLTO();
+// Reduces the waits of encoder `id` to a minimal set, dropping fences already covered by a waited-on encoder's summary.
+FenceSet
+FenceLocalityCheck::collectAndSimplifyWaits(FenceSet strong_fences, EncoderId id, bool implicit_pre_raster_wait) {
+  if (implicit_pre_raster_wait)
+    strong_fences.set(id - 1); // added to the local copy only; removed from the result before returning
+  // full_fences = strong fences plus WEAK_FENCE_MASK[id], minus weak bits in lanes strong_fences already uses.
+  FenceSet full_fences(strong_fences);
+  full_fences.mergeWithLaneMaskOff(WEAK_FENCE_MASK[id], strong_fences.laneMask());
+
+  FenceSet minimal_fences;    // result: fences that are kept as explicit waits
+  FenceSet accessible_fences; // fences covered so far, directly or through stored summaries
+
+  constexpr auto start_offset = kParityLane == 1 ? 0 : 1;
+  // Walk earlier encoder ids from nearest to farthest.
+  for (auto offset = start_offset; offset < kParityLane; offset++) {
+    EncoderId prev_encoder_id = id - offset;
+    // Keep a fence only if it is required and not already covered.
+    if (full_fences.test(prev_encoder_id) && !accessible_fences.testAndSet(prev_encoder_id))
+      minimal_fences.set(prev_encoder_id);
+    if (accessible_fences.test(prev_encoder_id)) // a covered encoder also covers everything in its summary
+      accessible_fences.merge(summary_[prev_encoder_id % kParityLane]);
+    if (accessible_fences.contains(full_fences)) // everything required is covered: stop early
+      break;
+  }
+
+  summary_[id % kParityLane] = full_fences; // stored so later calls can read it as a predecessor summary
+
+  if (implicit_pre_raster_wait)
+    minimal_fences.unset(id - 1);
+
+  return minimal_fences;
+}
+
+} // namespace dxmt
\ No newline at end of file
diff --git a/src/dxmt/dxmt_deptrack.hpp b/src/dxmt/dxmt_deptrack.hpp
index be001a85..e6eefccc 100644
--- a/src/dxmt/dxmt_deptrack.hpp
+++ b/src/dxmt/dxmt_deptrack.hpp
@@ -1,7 +1,297 @@
 #pragma once
-#include "util_bloom.hpp"
+#include <cstdint>
+#include <array>
+#include <cassert>
+#include <cstring>
 
 namespace dxmt {
-using EncoderDepSet = PartitionedBloomFilter64<16>;
-using EncoderDepKey = EncoderDepSet::Key;
+
+enum DXMT_ENCODER_RESOURCE_ACESS { // bit flags; READWRITE is the OR of READ and WRITE
+  DXMT_ENCODER_RESOURCE_ACESS_READ = 1 << 0,
+  DXMT_ENCODER_RESOURCE_ACESS_WRITE = 1 << 1,
+  DXMT_ENCODER_RESOURCE_ACESS_READWRITE = DXMT_ENCODER_RESOURCE_ACESS_READ | DXMT_ENCODER_RESOURCE_ACESS_WRITE,
+};
+
+constexpr auto kLog2Lane = 6ull;                                    // log2 of the lane count per word
+constexpr auto kLane = 1 << kLog2Lane;                              // lanes (bits) per LaneStorage word
+constexpr auto kLaneMask = kLane - 1;                               // extracts the lane index from an id (see LANE)
+constexpr auto kAllLaneMask = ~0ull >> (64 /* uint64_t */ - kLane); // word with the low kLane bits set
+constexpr auto kParity = 4; // can also use 3, although power of 2 is nice (FenceSet::forEach static_asserts 4)
+constexpr auto kParityLane = kParity * kLane;                       // total slots: kParity words of kLane lanes
+
+static_assert(kLog2Lane <= 6); // a LaneStorage word has 64 bits
+static_assert(kLane > 1);
+
+using LaneStorage = uint64_t; // one word of lane bits
+using EncoderId = uint64_t;
+
+constexpr auto
+PARITY(EncoderId id) {
+  return (id / kLane) % kParity; // which of the kParity words the id falls into
+}
+
+constexpr auto
+LANE(EncoderId id) {
+  return id % kLane; // bit position inside the word; kLane is a power of two
+}
+
+// Bit set over fence slots: kParity words of kLane bits each. An encoder id
+// selects its word with PARITY(id) and its bit with LANE(id).
+class FenceSet {
+public:
+  // Creates an empty set.
+  constexpr FenceSet() {
+    for (int i = 0; i < kParity; i++) {
+      storage_[i] = 0;
+    }
+  }
+
+  // Creates a set holding only the slot of `id`.
+  constexpr FenceSet(EncoderId id) : FenceSet() {
+    set(id);
+  }
+
+  // Defaulted member-wise copies: the type stays trivially copyable and can be
+  // copied in constant expressions.
+  FenceSet(const FenceSet &copy) = default;
+  FenceSet &operator=(const FenceSet &copy) = default;
+
+  ~FenceSet() = default;
+
+  // Adds the slot of `id`.
+  constexpr void
+  set(EncoderId id) {
+    storage_[PARITY(id)] |= (1ull << LANE(id));
+  }
+
+  // Removes the slot of `id`.
+  constexpr void
+  unset(EncoderId id) {
+    storage_[PARITY(id)] &= (kAllLaneMask & ~(1ull << LANE(id)));
+  }
+
+  // Adds the kLane consecutive slots that end at (and include) the slot with
+  // parity (parity - 1) mod kParity and the given lane.
+  constexpr void
+  fillGenerationBefore(int parity, int lane) {
+    const int idx = (parity + kParity + (kParity - 1)) * kLane + lane;
+    for (int offset = 0; offset < kLane; ++offset) {
+      set(idx - offset);
+    }
+  }
+
+  // Returns whether the slot of `id` is present.
+  constexpr bool
+  test(EncoderId id) const {
+    return storage_[PARITY(id)] & (1ull << LANE(id));
+  }
+
+  // Adds the slot of `id`; returns whether it was already present.
+  constexpr bool
+  testAndSet(EncoderId id) {
+    auto P = PARITY(id);
+    auto LM = 1ull << LANE(id);
+    if (storage_[P] & LM)
+      return true;
+    storage_[P] |= LM;
+    return false;
+  }
+
+  // In-place intersection with `set`.
+  FenceSet &
+  intersect(const FenceSet &set) {
+    for (int i = 0; i < kParity; i++) {
+      storage_[i] &= set.storage_[i];
+    }
+    return *this;
+  }
+
+  // Returns whether this set and `set` share at least one slot.
+  constexpr bool
+  intersectedWith(const FenceSet &set) const {
+    for (int i = 0; i < kParity; i++) {
+      if (storage_[i] & set.storage_[i])
+        return true;
+    }
+    return false;
+  }
+
+  // Returns whether every slot of `set` is also in this set.
+  constexpr bool
+  contains(const FenceSet &set) const {
+    for (int i = 0; i < kParity; i++) {
+      if ((storage_[i] & set.storage_[i]) != set.storage_[i])
+        return false;
+    }
+    return true;
+  }
+
+  // In-place union with `set`.
+  FenceSet &
+  merge(const FenceSet &set) {
+    for (int i = 0; i < kParity; i++) {
+      storage_[i] |= set.storage_[i];
+    }
+    return *this;
+  }
+
+  // Returns the union of this set and `set` without modifying either.
+  FenceSet
+  unionOf(const FenceSet &set) const {
+    FenceSet ret{};
+    for (int i = 0; i < kParity; i++) {
+      ret.storage_[i] = storage_[i] | set.storage_[i];
+    }
+    return ret;
+  }
+
+  // Removes every slot of `set` from this set.
+  FenceSet &
+  subtract(const FenceSet &set) {
+    for (int i = 0; i < kParity; i++) {
+      storage_[i] &= (kAllLaneMask & ~set.storage_[i]);
+    }
+    return *this;
+  }
+
+  // Merges `set` into this set, skipping the lanes (bit positions) set in `mask`.
+  FenceSet &
+  mergeWithLaneMaskOff(const FenceSet &set, const LaneStorage &mask) {
+    for (int i = 0; i < kParity; i++) {
+      storage_[i] |= (set.storage_[i] & (kAllLaneMask & ~mask));
+    }
+    return *this;
+  }
+
+  // Returns the OR of all parity words: the lanes used by any slot in the set.
+  LaneStorage
+  laneMask() const {
+    LaneStorage ret = 0;
+    for (int i = 0; i < kParity; i++) {
+      ret |= storage_[i];
+    }
+    return ret;
+  }
+
+  // Calls fn(index) for every present slot in ascending index order, where
+  // index = parity * kLane + lane.
+  template <typename Fn>
+  void
+  forEach(Fn &&fn) const {
+    static_assert(kParity == 4);
+    for (int P = 0; P < kParity; P++) {
+      auto lanes = storage_[P];
+      if (lanes == 0)
+        continue;
+      for (int L = 0; L < kLane; L++) {
+        if (lanes & (1ull << L)) {
+          fn(P * kLane + L);
+        }
+      }
+    }
+  }
+
+  // Walks the union of this set and `prior` in ascending index order: slots in
+  // `prior` go to fnPrior, the remaining slots of this set go to fn.
+  template <typename Fn, typename FnPrior>
+  void
+  forEach(const FenceSet &prior, FnPrior &&fnPrior, Fn &&fn) const {
+    static_assert(kParity == 4);
+    for (int P = 0; P < kParity; P++) {
+      auto lanes = storage_[P];
+      auto lanes_prior = prior.storage_[P];
+      if ((lanes | lanes_prior) == 0)
+        continue;
+      for (int L = 0; L < kLane; L++) {
+        if (lanes_prior & (1ull << L)) {
+          fnPrior(P * kLane + L);
+        } else if (lanes & (1ull << L)) {
+          fn(P * kLane + L);
+        }
+      }
+    }
+  }
+
+private:
+  LaneStorage storage_[kParity];
+};
+
+// Ring buffer of the last kLane encoder ids recorded through add() / clearAndAdd().
+class TrackingSet {
+public:
+  // Appends `id` unless it is already the newest entry; returns whether it was appended.
+  bool
+  add(EncoderId id) {
+    assert(storage_[cursor] <= id);
+    if (storage_[cursor] == id)
+      return false;
+    bumpCursor();
+    storage_[cursor] = id;
+    return true;
+  }
+
+  // Returns true when `id` is NOT the newest entry.
+  bool
+  check(EncoderId id) const {
+    return storage_[cursor] != id;
+  }
+
+  // Zeroes the newest entry; enumerate() stops at a zero entry for any non-zero id.
+  void
+  clear() {
+    storage_[cursor] = 0;
+  }
+
+  // Zeroes the newest entry, then appends `id` in the next slot.
+  void
+  clearAndAdd(EncoderId id) {
+    storage_[cursor] = 0;
+    bumpCursor();
+    storage_[cursor] = id;
+  }
+
+  // Walks entries from newest to oldest, calling fn(entry) for each entry greater than
+  // id - kLane. Entries equal to `id` are skipped; the first other entry ends the walk.
+  template <typename Fn>
+  void
+  enumerate(EncoderId id, Fn &&fn) const {
+    for (int i = 0; i < kLane; i++) {
+      auto c = storage_[(cursor + kLane - i) & kLaneMask];
+      if (c == id)
+        continue;
+      if (c <= (id - kLane))
+        break;
+      fn(c);
+    }
+  }
+
+private:
+  void
+  bumpCursor() {
+    cursor = (cursor + 1) & kLaneMask;
+  }
+
+  // Every slot starts at 0 so that no slot is ever read while indeterminate.
+  EncoderId storage_[kLane] = {};
+  uint32_t cursor = 0;
+};
+
+class GenericAccessTracker { // records which encoder ids read and wrote, and derives the fences to wait on
+public:
+  void read(EncoderId id, FenceSet &);  // registers a read by `id`; adds recorded writers to the given set
+  void write(EncoderId id, FenceSet &); // registers a write by `id`; adds recorded writers and readers to the set
+
+private:
+  TrackingSet read_;  // ids recorded by read(); cleared by write()
+  TrackingSet write_; // ids recorded by write()
+};
+
+class FenceLocalityCheck { // keeps one FenceSet summary per encoder slot
+public:
+  FenceSet collectAndSimplifyWaits(FenceSet strong_fences, EncoderId id, bool implicit_pre_raster_wait = false);
+
+private:
+  std::array<FenceSet, kParityLane> summary_; // indexed by id % kParityLane; written by collectAndSimplifyWaits
+};
+
 } // namespace dxmt
diff --git a/src/dxmt/dxmt_presenter.cpp b/src/dxmt/dxmt_presenter.cpp
index 1bd4fcef..f1da2b0f 100644
--- a/src/dxmt/dxmt_presenter.cpp
+++ b/src/dxmt/dxmt_presenter.cpp
@@ -115,7 +115,9 @@ Presenter::synchronizeLayerProperties() {
 
 WMT::MetalDrawable
 Presenter::encodeCommands(
-    WMT::CommandBuffer cmdbuf, WMT::Fence fence, WMT::Texture backbuffer, DXMTPresentMetadata metadata
+    WMT::CommandBuffer cmdbuf, WMT::Texture backbuffer, DXMTPresentMetadata metadata,
+    std::function<void(WMT::RenderCommandEncoder)> &&wait_fences,
+    std::function<void(WMT::RenderCommandEncoder)> &&update_fences
 ) {
   auto drawable = layer_.nextDrawable();
 
@@ -125,8 +127,7 @@ Presenter::encodeCommands(
   info.colors[0].store_action = WMTStoreActionStore;
   info.colors[0].texture = drawable.texture();
   auto encoder = cmdbuf.renderCommandEncoder(info);
-  if (fence)
-    encoder.waitForFence(fence, WMTRenderStageFragment);
+  wait_fences(encoder);
   encoder.setFragmentTexture(backbuffer, 0);
 
   double width = layer_props_.drawable_width;
@@ -140,7 +141,7 @@ Presenter::encodeCommands(
   }
   encoder.setViewport({0, 0, width, height, 0, 1});
   encoder.drawPrimitives(WMTPrimitiveTypeTriangle, 0, 3);
-
+  update_fences(encoder);
   encoder.endEncoding();
 
   return drawable;
diff --git a/src/dxmt/dxmt_presenter.hpp b/src/dxmt/dxmt_presenter.hpp
index 08171f56..de29249d 100644
--- a/src/dxmt/dxmt_presenter.hpp
+++ b/src/dxmt/dxmt_presenter.hpp
@@ -54,8 +54,11 @@ class Presenter : public RcObject {
 
   PresentState synchronizeLayerProperties();
 
-  WMT::MetalDrawable
-  encodeCommands(WMT::CommandBuffer cmdbuf, WMT::Fence fence, WMT::Texture backbuffer, DXMTPresentMetadata metadata);
+  WMT::MetalDrawable encodeCommands(
+      WMT::CommandBuffer cmdbuf, WMT::Texture backbuffer, DXMTPresentMetadata metadata,
+      std::function<void(WMT::RenderCommandEncoder)> &&wait_fences,
+      std::function<void(WMT::RenderCommandEncoder)> &&update_fences
+  );
 
 private:
   void buildRenderPipelineState(bool is_pq, bool with_hdr_metadata, bool is_ms);
diff --git a/src/dxmt/dxmt_texture.cpp b/src/dxmt/dxmt_texture.cpp
index bf79e19c..d508a783 100644
--- a/src/dxmt/dxmt_texture.cpp
+++ b/src/dxmt/dxmt_texture.cpp
@@ -50,7 +50,6 @@ TextureAllocation::TextureAllocation(
 
   gpuResourceID = info_copy.gpu_resource_id;
   machPort = 0;
-  depkey = EncoderDepSet::generateNewKey(global_texture_seq.fetch_add(1));
 };
 
 TextureAllocation::TextureAllocation(
@@ -63,7 +62,6 @@ TextureAllocation::TextureAllocation(
   mappedMemory = nullptr;
   gpuResourceID = textureDescriptor.gpu_resource_id;
   machPort = textureDescriptor.mach_port;
-  depkey = EncoderDepSet::generateNewKey(global_texture_seq.fetch_add(1));
 };
 
 TextureAllocation::~TextureAllocation(){
@@ -159,12 +157,9 @@ Texture::Texture(
 
 Rc<TextureAllocation>
 Texture::allocate(Flags<TextureAllocationFlag> flags) {
-  WMTResourceOptions options = WMTResourceStorageModeShared;
+  WMTResourceOptions options = WMTResourceHazardTrackingModeUntracked;
   WMTTextureInfo info = info_; // copy
   info.mach_port = 0;
-  if (flags.test(TextureAllocationFlag::GpuReadonly)) {
-    options |= WMTResourceHazardTrackingModeUntracked;
-  }
   if (flags.test(TextureAllocationFlag::CpuWriteCombined)) {
     options |= WMTResourceOptionCPUCacheModeWriteCombined;
   }
diff --git a/src/dxmt/dxmt_texture.hpp b/src/dxmt/dxmt_texture.hpp
index ccf28379..bfe3cbf5 100644
--- a/src/dxmt/dxmt_texture.hpp
+++ b/src/dxmt/dxmt_texture.hpp
@@ -98,7 +98,7 @@ class TextureAllocation : public Allocation {
   void *mappedMemory;
   uint64_t gpuResourceID;
   mach_port_t machPort;
-  EncoderDepKey depkey;
+  GenericAccessTracker fenceTracker;
 
 private:
   TextureAllocation(
diff --git a/src/dxmt/meson.build b/src/dxmt/meson.build
index 0eac0d64..6cb08d99 100644
--- a/src/dxmt/meson.build
+++ b/src/dxmt/meson.build
@@ -17,6 +17,7 @@ dxmt_src = [
   'dxmt_sampler.cpp',
   'dxmt_resource_initializer.cpp',
   'dxmt_shader_cache.cpp',
+  'dxmt_deptrack.cpp',
 ]
 
 dxmt_shaders = [
diff --git a/src/winemetal/winemetal.h b/src/winemetal/winemetal.h
index 767f84ca..0e777416 100644
--- a/src/winemetal/winemetal.h
+++ b/src/winemetal/winemetal.h
@@ -1113,6 +1113,7 @@ enum WMTRenderStages : uint8_t {
   WMTRenderStageTile = 4,
   WMTRenderStageObject = 8,
   WMTRenderStageMesh = 16,
+  WMTRenderStagePreRaster = WMTRenderStageVertex | WMTRenderStageObject | WMTRenderStageMesh,
 };
 
 struct wmtcmd_render_useresource {