-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathBarrierPopStack.c
More file actions
124 lines (111 loc) · 5.26 KB
/
BarrierPopStack.c
File metadata and controls
124 lines (111 loc) · 5.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// Algorithm properties:
//
// * probably invented before (?)
//
// * not many lines of code
//
// * uses only swap/exchange/FAS.
//
// * compact in terms of memory usage
//
// * no TLS or dynamic memory allocation
//
// * the algorithm assumes that the number of participating threads in the system is the same as a barrier consensus
// number. This isn't true in all uses of barriers, however. For instance you might have 20 threads circulating and
// want to hold at a barrier until 10 have arrived and then release those 10. If we have 20 threads in play and the
// barrier consensus "N" is 10, we need to support inter-generational waiting. With a few changes to the algorithm,
// we could support that mode of use. This is always something of an interesting topic of debate.
//
// * we could skip the FAS that detaches the list of threads waiting at the barrier and just load it and then store null
// in a non-atomic fashion. The FAS gives us better asserts, though.
//
// * supports the pthreads requirement that a barrier can be immediately free()ed or destroyed after _any thread returns
// from the barrier.
//
// * local spinning
//
// * amenable to various waiting schemes. We can use Pause(), park-unpark, a futex, or even embed a mutex/condvar pair
// in each of the wait elements and then wait in that fashion. All waking is 1:1.
//
// * On TSO I don't think we need any additional fences. On weaker memory models we'll need fences, but not many.
//
// * The progress properties of waiting for the Ordinal to resolve don't thrill me. That waiting step is somewhat
// analogous to that waiting loop we have in the MCS unlock() operator.
//
// * the wakeup sequence is currently LIFO. That doesn't bother me. But if that was a concern, we could easily
// propagate the address of the head through the list in the same way we currently propagate the ordinal values and also
// add explicit wait element links ("Next" pointers) that flow from the head towards the tail. This doesn't add much
// overhead above and beyond the existing propagation of the Ordinal value. When we trigger the barrier, we'd then
// start at the head and propagate notification toward the tail - the reverse direction from what we currently have.
// That is, with just a bit more effort, we could have FIFO/FCFS wakeup ordering at likely almost no impact on
// performance. I don't have strong feelings on the wakeup order.
//
// * The worst-case wakeup time could be pretty bad, as the notifications flow through the detached chain with each
// thread waking its predecessor in turn. Things get ugly if we're blocking our waiting threads in the kernel, as it
// can take a long time to reach those threads buried deep on the stack, given unpark latencies. It's possible,
// though, to re-form the list into a binary wakeup tree. There are also some helping tricks available, but all that
// I know of involve firing potentially redundant unpark() operations that might in turn cause spurious wakeups.
// That's fine in the world of park-unpark, but possibly not acceptable in your environment.
// Inspired by CLH in that there's no explicit lists of threads, and a waiting thread knows only about its immediate
// successor, whereas MCS has "next" pointers in memory. But it is unlike CLH in that it waits on a field in the
// queued element, instead of the previous element.
#include "BarrierCallback.h"
// Per-arrival wait record. block() declares one as a local variable and publishes its address through
// Barrier.barrier, so each waiting thread spins only on its own element.
// NOTE(review): CALIGN and VTYPE are defined outside this file -- presumably an alignment attribute and a
// volatile integer type; confirm in the harness headers.
typedef struct CALIGN waitelement {
// Arrival position of the owning thread, counted from 1; 0 means "not yet resolved".
VTYPE Ordinal;
// Written to true by another thread to release the owner from its wait in block().
VTYPE Gate;
} WaitElement;
// Barrier state shared by all participating threads.
typedef struct {
// Number of arrivals that completes the barrier; block() compares each thread's Ordinal against it.
TYPE CALIGN group;
// Most recently pushed WaitElement, or NULL when no thread is currently waiting.
WaitElement * barrier;
// Additional members injected by a macro defined outside this file.
// NOTE(review): presumably callback bookkeeping from BarrierCallback.h -- confirm.
CBDECL();
} Barrier;
static TYPE PAD1 CALIGN __attribute__(( unused )); // protect further false sharing
// The single barrier instance used by this file; (re)initialized in ctor() and passed to block() via BARRIER_CALL.
static Barrier b CALIGN;
static TYPE PAD2 CALIGN __attribute__(( unused )); // protect further false sharing
// Hook macros (NOTE(review): presumably expanded by the included BarrierWorker.c -- confirm):
// BARRIER_DECL is intentionally empty, and BARRIER_CALL performs one barrier episode on the global instance.
#define BARRIER_DECL
#define BARRIER_CALL block( &b );
// Arrive at barrier b and wait until b->group threads have arrived.
//
// Each arriving thread pushes a local WaitElement onto b->barrier with Fas (the swap/exchange primitive named in the
// file header) and derives its arrival ordinal from the element that was previously on top: predecessor's ordinal
// plus one, or 1 if the stack was empty. Threads whose ordinal is below b->group wait on their own Gate. The thread
// whose ordinal reaches b->group detaches the stack and sets its predecessor's Gate; every released thread in turn
// sets its own predecessor's Gate, so the notification walks down the stack one element at a time.
//
// Parameters:
//   b - the barrier to wait on; b->group must hold the number of participating threads.
// Returns:
//   true for the one thread that completed the group and triggered the release, false for every other thread.
static inline bool block( Barrier * b ) {
	CBSTART();											// must be first
	WaitElement W = { .Gate = false, .Ordinal = 0 };	// mark as not yet resolved
	WaitElement * pred = Fas( b->barrier, &W );			// push W; pred is the previous top of stack or NULL
	assert( pred != &W );
	if ( pred != NULL ) {
		await( pred->Ordinal != 0 );					// wait for predecessor's count to resolve
		W.Ordinal = pred->Ordinal + 1;
	} else {
		W.Ordinal = 1 ;									// first arrival of this episode
	} // if
	assert( W.Ordinal != 0 );

	if ( LIKELY( W.Ordinal < b->group ) ) {				// intermediate thread ?
		await( W.Gate );								// primary waiting loop
		// pred arrived earlier, so its ordinal is also below b->group and it is still waiting on its own Gate;
		// its element is therefore still live when it is written here.
		if ( pred != NULL ) {
			assert( ! pred->Gate );
			pred->Gate = true;							// propagate notification through the stack
		} // if
		return false;
	} // if

	// Last arrival: W is the top of the stack and holds ordinal b->group.
	CBEND();											// must appear in safe location
	// Empty the stack before waking anyone so the barrier is ready for the next episode.
#ifdef NDEBUG
	b->barrier = NULL;
#else
	// Detach through the same list-head field that arrivals push onto, so the asserts can verify that the
	// detached top element is this thread's own W.
	WaitElement * DetachedList = Fas( b->barrier, NULL );
	assert( DetachedList == &W );
	assert( ! DetachedList->Gate );
#endif
	if ( pred != NULL ) {
		assert( ! pred->Gate );
		pred->Gate = true;								// propagate notification through the stack
	} // if
	return true;
} // block
#include "BarrierWorker.c"
void __attribute__((noinline)) ctor() {
worker_ctor();
b = (Barrier){ .group = N, .barrier = NULL CBINIT() };
} // ctor
// Tear-down hook: delegates entirely to worker_dtor(); nothing barrier-specific is done here.
void __attribute__((noinline)) dtor() {
worker_dtor();
} // dtor
// Local Variables: //
// compile-command: "gcc -Wall -Wextra -std=gnu11 -O3 -DNDEBUG -fno-reorder-functions -DPIN -DAlgorithm=BarrierPopStack Harness.c -lpthread -lm -D`hostname` -DCFMT" //
// End: //