From 66ce8ed5f6721f4c2cee26f86ca5cc39445a6778 Mon Sep 17 00:00:00 2001 From: Mat Carter Date: Mon, 9 Mar 2026 16:41:10 -0700 Subject: [PATCH 1/5] ARM64 Dekker-pattern StoreLoad fences in java.util.concurrent + VirtualThread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On ARM64, volatile write (STLR/release) + volatile read (LDAR/acquire) to different addresses does NOT provide StoreLoad ordering. This breaks Dekker-like protocols where one side writes field A then reads field B, while the other writes B then reads A — both sides can miss each other's stores. This adds U.fullFence() / VarHandle.fullFence() at all identified Dekker-pattern sites: VirtualThread.java: - afterYield() PARKING path: between setState(PARKED/TIMED_PARKED) and reading parkPermit (Dekker with unpark) - afterYield() BLOCKING path: between setState(BLOCKED) and reading blockPermit (Dekker with unblock) - afterYield() WAITING path: between setState(WAIT/TIMED_WAIT) and reading notified (Dekker with notify); fences in both untimed and timed sub-paths (adapted for tip's per-path inline checks) - afterDone(): between setState(TERMINATED) and reading notifyAllAfterTerminate (Dekker with beforeJoin) - unpark(): between getAndSetParkPermit(true) and reading state (Dekker with afterYield PARKING path) - unblock(): between blockPermit=true and reading state (Dekker with afterYield BLOCKING path) LinkedTransferQueue.java: - xfer(): between cmpExItem CAS and reading waiter (Dekker with await() which writes waiter then reads item) SynchronousQueue.java: - xferLifo(): between cmpExItem CAS and reading waiter (same Dekker as LinkedTransferQueue) AbstractQueuedSynchronizer.java: - acquire(): between node.status=WAITING and re-reading state in tryAcquire/tryAcquireShared (Dekker with release/releaseShared) - release(): between tryRelease state update and reading node.status in signalNext - releaseShared(): same as release() These fences are correctness-critical on 
ARM64, functionally redundant on x86 (TSO alone still allows StoreLoad reordering, but HotSpot's volatile stores and lock-prefixed CAS already include a full barrier there), and appear only on non-hot paths (state transitions, not tight loops). --- .../classes/java/lang/VirtualThread.java | 31 +++++++++++++++++++ .../util/concurrent/LinkedTransferQueue.java | 5 +++ .../util/concurrent/SynchronousQueue.java | 5 +++ .../locks/AbstractQueuedSynchronizer.java | 16 ++++++++++ 4 files changed, 57 insertions(+) diff --git a/src/java.base/share/classes/java/lang/VirtualThread.java b/src/java.base/share/classes/java/lang/VirtualThread.java index f058f967b9135..f650436a4478a 100644 --- a/src/java.base/share/classes/java/lang/VirtualThread.java +++ b/src/java.base/share/classes/java/lang/VirtualThread.java @@ -579,6 +579,13 @@ private void afterYield() { setState(newState = TIMED_PARKED); } + // Full fence (StoreLoad) to ensure the PARKED/TIMED_PARKED state + // is visible before reading parkPermit (Dekker pattern with + // unpark which writes parkPermit then reads state). + // Note: storeFence is insufficient — on ARM64 it only emits + // LoadStore+StoreStore (dmb ishst), not StoreLoad (dmb ish). + U.fullFence(); + // may have been unparked while parking if (parkPermit && compareAndSetState(newState, UNPARKED)) { // lazy submit if local queue is empty @@ -604,6 +611,10 @@ private void afterYield() { if (s == BLOCKING) { setState(BLOCKED); + // Full fence (StoreLoad) for Dekker pattern with unblock + // which writes blockPermit then reads state. + U.fullFence(); + // may have been unblocked while blocking if (blockPermit && compareAndSetState(BLOCKED, UNBLOCKED)) { // lazy submit if local queue is empty @@ -619,6 +630,9 @@ private void afterYield() { boolean interruptible = interruptibleWait; if (s == WAITING) { setState(newState = WAIT); + // Full fence (StoreLoad) for Dekker pattern with notify + // which writes notified then reads state. 
+ U.fullFence(); // may have been notified while in transition blocked = notified && compareAndSetState(WAIT, BLOCKED); } else { @@ -635,6 +649,9 @@ private void afterYield() { byte seqNo = ++timedWaitSeqNo; timeoutTask = schedule(() -> waitTimeoutExpired(seqNo), timeout, MILLISECONDS); setState(newState = TIMED_WAIT); + // Full fence (StoreLoad) for Dekker pattern with notify + // which writes notified then reads state. + U.fullFence(); // May have been notified while in transition. This must be done while // holding the monitor to avoid changing the state of a new timed wait call. blocked = notified && compareAndSetState(TIMED_WAIT, BLOCKED); @@ -675,6 +692,15 @@ private void afterDone(boolean notifyContainer) { assert carrierThread == null; setState(TERMINATED); + // Full fence (StoreLoad) to ensure the TERMINATED state is + // visible before reading notifyAllAfterTerminate (Dekker pattern + // with beforeJoin which writes notifyAllAfterTerminate then + // reads state). Without this, on ARM64 the volatile write of + // state and the subsequent volatile read can be reordered, + // causing a missed-wakeup where both sides miss each other's + // store. + U.fullFence(); + // notifyAll to wakeup any threads waiting for this thread to terminate if (notifyAllAfterTerminate) { synchronized (this) { @@ -870,6 +896,10 @@ private void parkOnCarrierThread(boolean timed, long nanos) { */ private void unpark(boolean lazySubmit) { if (!getAndSetParkPermit(true) && currentThread() != this) { + // Full fence (StoreLoad) to ensure parkPermit=true is visible + // before reading state (Dekker pattern with afterYield PARKING + // path which writes state then reads parkPermit). 
+ U.fullFence(); int s = state(); // unparked while parked @@ -912,6 +942,7 @@ void unpark() { private void unblock() { assert !Thread.currentThread().isVirtual(); blockPermit = true; + U.fullFence(); // Full fence (StoreLoad) for Dekker with afterYield BLOCKING path if (state() == BLOCKED && compareAndSetState(BLOCKED, UNBLOCKED)) { submitRunContinuation(); } diff --git a/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java b/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java index 787d8d5fecd87..558c5ab5c2bc8 100644 --- a/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/LinkedTransferQueue.java @@ -590,6 +590,11 @@ final Object xfer(Object e, long ns) { q = p.next; if (p.isData != haveData && haveData != (m != null)) { if (p.cmpExItem(m, e) == m) { + // Full fence (StoreLoad) for Dekker with await() which + // writes waiter then reads item. On ARM64, CAS + // (ldaxr/stlxr) + plain load to a different field does + // NOT provide StoreLoad ordering. + VarHandle.fullFence(); Thread w = p.waiter; // matched complementary node if (p != h && h == cmpExHead(h, (q == null) ? p : q)) h.next = h; // advance head; self-link old diff --git a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java index 49efe5d5c2c0d..c74a2483aa4b4 100644 --- a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java @@ -177,6 +177,11 @@ final Object xferLifo(Object e, long ns) { else if (p.cmpExItem(m, e) != m) p = head; // missed; restart else { // matched complementary node + // Full fence (StoreLoad) for Dekker with await() which + // writes waiter then reads item. On ARM64, CAS + // (ldaxr/stlxr) + plain load to a different field does + // NOT provide StoreLoad ordering. 
+ VarHandle.fullFence(); Thread w = p.waiter; cmpExHead(p, p.next); LockSupport.unpark(w); diff --git a/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java b/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java index c077954508341..dfdde38642fd8 100644 --- a/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java +++ b/src/java.base/share/classes/java/util/concurrent/locks/AbstractQueuedSynchronizer.java @@ -782,6 +782,13 @@ final int acquire(Node node, int arg, boolean shared, Thread.onSpinWait(); } else if (node.status == 0) { node.status = WAITING; // enable signal and recheck + // Full fence (StoreLoad) to ensure WAITING status is visible + // before re-reading state in tryAcquire/tryAcquireShared + // (Dekker pattern with releaseShared/release which writes + // state then reads node.status in signalNext). + // On ARM64, volatile write (stlr) + volatile read (ldar) to + // different addresses does NOT provide StoreLoad ordering. + U.fullFence(); } else { spins = postSpins = (byte)((postSpins << 1) | 1); try { @@ -1097,6 +1104,13 @@ public final boolean tryAcquireNanos(int arg, long nanosTimeout) */ public final boolean release(int arg) { if (tryRelease(arg)) { + // Full fence (StoreLoad) to ensure the state update from + // tryRelease is visible before reading node.status in signalNext + // (Dekker pattern: release writes state then reads status, + // acquire writes status then reads state). + // On ARM64, CAS (stlxr/release) + ldar to different addresses + // does NOT provide StoreLoad ordering. 
+ U.fullFence(); signalNext(head); return true; } @@ -1184,6 +1198,8 @@ public final boolean tryAcquireSharedNanos(int arg, long nanosTimeout) */ public final boolean releaseShared(int arg) { if (tryReleaseShared(arg)) { + // Full fence (StoreLoad) — see comment in release() + U.fullFence(); signalNext(head); return true; } From 54015c0fc12ae2fba53f1a9a64661705adc06e03 Mon Sep 17 00:00:00 2001 From: Saint Wesonga Date: Tue, 17 Mar 2026 09:04:40 -0600 Subject: [PATCH 2/5] Undo VirtualThread.java changes --- .../classes/java/lang/VirtualThread.java | 31 ------------------- 1 file changed, 31 deletions(-) diff --git a/src/java.base/share/classes/java/lang/VirtualThread.java b/src/java.base/share/classes/java/lang/VirtualThread.java index f650436a4478a..f058f967b9135 100644 --- a/src/java.base/share/classes/java/lang/VirtualThread.java +++ b/src/java.base/share/classes/java/lang/VirtualThread.java @@ -579,13 +579,6 @@ private void afterYield() { setState(newState = TIMED_PARKED); } - // Full fence (StoreLoad) to ensure the PARKED/TIMED_PARKED state - // is visible before reading parkPermit (Dekker pattern with - // unpark which writes parkPermit then reads state). - // Note: storeFence is insufficient — on ARM64 it only emits - // LoadStore+StoreStore (dmb ishst), not StoreLoad (dmb ish). - U.fullFence(); - // may have been unparked while parking if (parkPermit && compareAndSetState(newState, UNPARKED)) { // lazy submit if local queue is empty @@ -611,10 +604,6 @@ private void afterYield() { if (s == BLOCKING) { setState(BLOCKED); - // Full fence (StoreLoad) for Dekker pattern with unblock - // which writes blockPermit then reads state. 
- U.fullFence(); - // may have been unblocked while blocking if (blockPermit && compareAndSetState(BLOCKED, UNBLOCKED)) { // lazy submit if local queue is empty @@ -630,9 +619,6 @@ private void afterYield() { boolean interruptible = interruptibleWait; if (s == WAITING) { setState(newState = WAIT); - // Full fence (StoreLoad) for Dekker pattern with notify - // which writes notified then reads state. - U.fullFence(); // may have been notified while in transition blocked = notified && compareAndSetState(WAIT, BLOCKED); } else { @@ -649,9 +635,6 @@ private void afterYield() { byte seqNo = ++timedWaitSeqNo; timeoutTask = schedule(() -> waitTimeoutExpired(seqNo), timeout, MILLISECONDS); setState(newState = TIMED_WAIT); - // Full fence (StoreLoad) for Dekker pattern with notify - // which writes notified then reads state. - U.fullFence(); // May have been notified while in transition. This must be done while // holding the monitor to avoid changing the state of a new timed wait call. blocked = notified && compareAndSetState(TIMED_WAIT, BLOCKED); @@ -692,15 +675,6 @@ private void afterDone(boolean notifyContainer) { assert carrierThread == null; setState(TERMINATED); - // Full fence (StoreLoad) to ensure the TERMINATED state is - // visible before reading notifyAllAfterTerminate (Dekker pattern - // with beforeJoin which writes notifyAllAfterTerminate then - // reads state). Without this, on ARM64 the volatile write of - // state and the subsequent volatile read can be reordered, - // causing a missed-wakeup where both sides miss each other's - // store. 
- U.fullFence(); - // notifyAll to wakeup any threads waiting for this thread to terminate if (notifyAllAfterTerminate) { synchronized (this) { @@ -896,10 +870,6 @@ private void parkOnCarrierThread(boolean timed, long nanos) { */ private void unpark(boolean lazySubmit) { if (!getAndSetParkPermit(true) && currentThread() != this) { - // Full fence (StoreLoad) to ensure parkPermit=true is visible - // before reading state (Dekker pattern with afterYield PARKING - // path which writes state then reads parkPermit). - U.fullFence(); int s = state(); // unparked while parked @@ -942,7 +912,6 @@ void unpark() { private void unblock() { assert !Thread.currentThread().isVirtual(); blockPermit = true; - U.fullFence(); // Full fence (StoreLoad) for Dekker with afterYield BLOCKING path if (state() == BLOCKED && compareAndSetState(BLOCKED, UNBLOCKED)) { submitRunContinuation(); } From 6b9e14fe83a9885af452c1c57cc1875152419f5d Mon Sep 17 00:00:00 2001 From: Saint Wesonga Date: Tue, 24 Mar 2026 08:30:09 -0600 Subject: [PATCH 3/5] Tip Run 02 --- .../share/classes/java/util/concurrent/SynchronousQueue.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java index c74a2483aa4b4..9f224ae744016 100644 --- a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java @@ -181,6 +181,7 @@ else if (p.cmpExItem(m, e) != m) // writes waiter then reads item. On ARM64, CAS // (ldaxr/stlxr) + plain load to a different field does // NOT provide StoreLoad ordering. 
+ // GHA Run 02 VarHandle.fullFence(); Thread w = p.waiter; cmpExHead(p, p.next); From b88d3098c04a67c2512b2fe521ea3710907c777e Mon Sep 17 00:00:00 2001 From: Saint Wesonga Date: Thu, 26 Mar 2026 08:22:37 -0600 Subject: [PATCH 4/5] Tip Run 03 --- .../share/classes/java/util/concurrent/SynchronousQueue.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java index 9f224ae744016..6a789299da4b3 100644 --- a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java @@ -181,7 +181,7 @@ else if (p.cmpExItem(m, e) != m) // writes waiter then reads item. On ARM64, CAS // (ldaxr/stlxr) + plain load to a different field does // NOT provide StoreLoad ordering. - // GHA Run 02 + // GHA Run 03 VarHandle.fullFence(); Thread w = p.waiter; cmpExHead(p, p.next); From 5063d47258e63b329191fb02bc6f8f4b89af8c9e Mon Sep 17 00:00:00 2001 From: Saint Wesonga Date: Thu, 26 Mar 2026 17:02:10 -0600 Subject: [PATCH 5/5] Tip Run 04 --- .../share/classes/java/util/concurrent/SynchronousQueue.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java index 6a789299da4b3..a9c475899c47e 100644 --- a/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java +++ b/src/java.base/share/classes/java/util/concurrent/SynchronousQueue.java @@ -181,7 +181,7 @@ else if (p.cmpExItem(m, e) != m) // writes waiter then reads item. On ARM64, CAS // (ldaxr/stlxr) + plain load to a different field does // NOT provide StoreLoad ordering. - // GHA Run 03 + // GHA Run 04 VarHandle.fullFence(); Thread w = p.waiter; cmpExHead(p, p.next);