/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_RUNTIME_ORDERACCESS_HPP
#define SHARE_RUNTIME_ORDERACCESS_HPP

#include "memory/allocation.hpp"
#include "runtime/atomic.hpp"
#include "utilities/macros.hpp"

// Memory Access Ordering Model
//
// This interface is based on the JSR-133 Cookbook for Compiler Writers.
//
// In the following, the terms 'previous', 'subsequent', 'before',
// 'after', 'preceding' and 'succeeding' refer to program order. The
// terms 'down' and 'below' refer to forward load or store motion
// relative to program order, while 'up' and 'above' refer to backward
// motion.
//
// We define four primitive memory barrier operations.
//
// LoadLoad:   Load1(s); LoadLoad; Load2
//
// Ensures that Load1 completes (obtains the value it loads from memory)
// before Load2 and any subsequent load operations. Loads before Load1
// may *not* float below Load2 and any subsequent load operations.
//
// StoreStore: Store1(s); StoreStore; Store2
//
// Ensures that Store1 completes (the effect on memory of Store1 is made
// visible to other processors) before Store2 and any subsequent store
// operations. Stores before Store1 may *not* float below Store2 and any
// subsequent store operations.
//
// LoadStore:  Load1(s); LoadStore; Store2
//
// Ensures that Load1 completes before Store2 and any subsequent store
// operations. Loads before Load1 may *not* float below Store2 and any
// subsequent store operations.
//
// StoreLoad:  Store1(s); StoreLoad; Load2
//
// Ensures that Store1 completes before Load2 and any subsequent load
// operations. Stores before Store1 may *not* float below Load2 and any
// subsequent load operations.
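//
// As an illustration (a sketch only; 'data', 'flag' and 'use' are
// hypothetical, with 'flag' initially zero), StoreStore on the writer side
// paired with LoadLoad on the reader side orders a simple flag-based
// hand-off:
//
//   Writer:                           Reader:
//     data = 42;                        while (flag == 0) { }
//     OrderAccess::storestore();        OrderAccess::loadload();
//     flag = 1;                         use(data);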
//
// We define two further barriers: acquire and release.
//
// Conceptually, acquire/release semantics form unidirectional and
// asynchronous barriers w.r.t. a synchronizing load(X) and store(X) pair.
// They should always be used in pairs to publish (release store) and
// access (load acquire) some implicitly understood shared data between
// threads in a relatively cheap fashion not requiring storeload. If not
// used in such a pair, it is advised to use a membar instead:
// acquire/release only make sense as pairs.
//
// T1: access_shared_data
// T1: ]release
// T1: (...)
// T1: store(X)
//
// T2: load(X)
// T2: (...)
// T2: acquire[
// T2: access_shared_data
//
// It is guaranteed that if T2: load(X) synchronizes with (observes the
// value written by) T1: store(X), then the memory accesses before the T1:
// ]release happen before the memory accesses after the T2: acquire[.
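//
// Expressed with the bound operations of this API, the same pattern could
// look as follows (a sketch; '_data' and '_flag' are hypothetical fields,
// with '_flag' initially zero):
//
//   T1: _data = compute();                                  // access_shared_data
//   T1: OrderAccess::release_store(&_flag, 1);              // ]release; store(X)
//
//   T2: while (OrderAccess::load_acquire(&_flag) == 0) { }  // load(X); acquire[
//   T2: use(_data);                                         // access_shared_data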
//
// Total Store Order (TSO) machines can be seen as machines issuing a
// release store for each store and a load acquire for each load. Therefore
// there is an inherent resemblance between TSO and acquire/release
// semantics. TSO can be seen as an abstract machine where loads are
// executed immediately when encountered (hence loadload reordering does
// not happen) but stores are enqueued in a FIFO queue for asynchronous
// serialization (hence neither storestore nor loadstore reordering
// happens). The only reordering that does happen is storeload, due to the
// queue asynchronously serializing stores (yet in order).
//
// Acquire/release semantics essentially exploit this asynchronicity: when
// the load(X) acquire[ observes the store of ]release store(X), the
// accesses before the release must have happened before the accesses after
// acquire.
//
// The API offers both stand-alone acquire() and release() as well as bound
// load_acquire() and release_store(). It is guaranteed that these are
// semantically equivalent w.r.t. the defined model. However, since the
// stand-alone acquire()/release() operations do not know which previous
// load or subsequent store is the synchronizing load/store, their
// implementations may have to be more conservative. We advise using the
// bound variants whenever possible.
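//
// For example (a sketch; 'p' and 'v' are hypothetical), the two forms below
// are semantically equivalent, but the bound form tells the platform exactly
// which store the release is tied to:
//
//   OrderAccess::release_store(p, v);   // bound: release tied to this store
//
//   OrderAccess::release();             // stand-alone: potentially more
//   Atomic::store(v, p);                // conservative on some platforms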
//
// Finally, we define a "fence" operation, as a bidirectional barrier.
// It guarantees that any memory access preceding the fence is not
// reordered w.r.t. any memory accesses subsequent to the fence in program
// order. This may be used to prevent sequences of loads from floating up
// above sequences of stores.
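//
// For example, Dekker-like synchronization needs such a barrier: each thread
// must make its own store visible before reading the other thread's flag
// (a sketch; '_a', '_b', 'r1' and 'r2' are hypothetical, initially zero):
//
//   T1: _a = 1;                     T2: _b = 1;
//   T1: OrderAccess::fence();       T2: OrderAccess::fence();
//   T1: r1 = _b;                    T2: r2 = _a;
//
// Without the fences both r1 and r2 may end up zero; with them, at least one
// of the two threads observes the other's store.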
//
// The following table shows the implementations on some architectures:
//
//                   Constraint     x86          sparc TSO          ppc
// ---------------------------------------------------------------------------
// fence              LoadStore  |  lock         membar #StoreLoad  sync
//                    StoreStore |  addl 0,(sp)
//                    LoadLoad   |
//                    StoreLoad
//
// release            LoadStore  |                                  lwsync
//                    StoreStore
//
// acquire            LoadLoad   |                                  lwsync
//                    LoadStore
//
// release_store                    <store>      <store>            lwsync
//                                                                  <store>
//
// release_store_fence              xchg         <store>            lwsync
//                                               membar #StoreLoad  <store>
//                                                                  sync
//
//
// load_acquire                     <load>       <load>             <load>
//                                                                  lwsync
//
// Ordering a load relative to preceding stores requires a StoreLoad,
// which implies a membar #StoreLoad between the store and load under
// sparc-TSO. On x86, we use an explicitly locked add.
//
// Conventional usage is to issue a load_acquire for ordered loads. Use
// release_store for ordered stores when you care only that prior stores
// are visible before the release_store, but don't care exactly when the
// store associated with the release_store becomes visible. Use
// release_store_fence to update values like the thread state, where we
// don't want the current thread to continue until all our prior memory
// accesses (including the new thread state) are visible to other threads.
// This is equivalent to the volatile semantics of the Java Memory Model.
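//
// For example (a sketch; '_thread_state', '_thread_in_native' and '_result'
// are hypothetical stand-ins for the real thread-state machinery):
//
//   // Writer: publish the result and then the state change, and do not
//   // proceed until both are visible to other threads:
//   _result = compute();
//   OrderAccess::release_store_fence(&_thread_state, _thread_in_native);
//
//   // Reader: pairs with a load_acquire of the same field:
//   if (OrderAccess::load_acquire(&_thread_state) == _thread_in_native) {
//     use(_result);
//   }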
//
// C++ Volatile Semantics
//
// C++ volatile semantics prevent compiler re-ordering between
// volatile memory accesses. However, reordering between non-volatile
// and volatile memory accesses is in general undefined. For compiler
// reordering constraints taking non-volatile memory accesses into
// consideration, a compiler barrier has to be used instead. Some
// compiler implementations may choose to enforce additional
// constraints beyond those required by the language. Note also that
// neither volatile semantics nor a compiler barrier prevents
// hardware reordering.
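//
// As an illustration only (a sketch using a GCC/Clang-style inline-asm
// compiler barrier, which is not part of this API):
//
//   int x = *p1;                           // non-volatile load
//   __asm__ volatile ("" : : : "memory");  // compiler barrier: the compiler
//                                          // may not reorder memory accesses
//                                          // across it, but the hardware may
//   int y = *p2;                           // non-volatile load
//
// The OrderAccess operations below are intended to constrain both the
// compiler and the hardware.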
//
// os::is_MP Considered Redundant
//
// Callers of this interface do not need to test os::is_MP() before
// issuing an operation. The test is taken care of by the implementation
// of the interface (depending on the vm version and platform, the test
// may or may not be actually done by the implementation).
//
//
// A Note on Memory Ordering and Cache Coherency
//
// Cache coherency and memory ordering are orthogonal concepts, though they
// interact. E.g., all existing itanium machines are cache-coherent, but
// the hardware can freely reorder loads wrt other loads unless it sees a
// load-acquire instruction. All existing sparc machines are cache-coherent
// and, unlike itanium, TSO guarantees that the hardware orders loads wrt
// loads and stores, and stores wrt each other.
//
// Consider the implementation of loadload. *If* your platform *isn't*
// cache-coherent, then loadload must not only prevent hardware load
// instruction reordering, but it must *also* ensure that subsequent
// loads from addresses that could be written by other processors (i.e.,
// that are broadcast by other processors) go all the way to the first
// level of memory shared by those processors and the one issuing
// the loadload.
//
// So if we have an MP that has, say, a per-processor D$ that doesn't see
// writes by other processors, and has a shared E$ that does, the loadload
// barrier would have to make sure that either
//
// 1. cache lines in the issuing processor's D$ that contained data from
// addresses that could be written by other processors are invalidated, so
// subsequent loads from those addresses go to the E$, (it could do this
// by tagging such cache lines as 'shared', though how to tell the hardware
// to do the tagging is an interesting problem), or
//
// 2. there never are such cache lines in the issuing processor's D$, which
// means all references to shared data (however identified: see above)
// bypass the D$ (i.e., are satisfied from the E$).
//
// If your machine doesn't have an E$, substitute 'main memory' for 'E$'.
//
// Either of these alternatives is a pain, so no current machine we know of
// has incoherent caches.
//
// If loadload didn't have these properties, the store-release sequence for
// publishing a shared data structure wouldn't work, because a processor
// trying to read data newly published by another processor might go to
// its own incoherent caches to satisfy the read instead of to the newly
// written shared memory.
//
//
// NOTE WELL!!
//
// A Note on MutexLocker and Friends
//
// See mutexLocker.hpp. We assume throughout the VM that MutexLocker's
// and friends' constructors do a fence, a lock and an acquire *in that
// order*. And that their destructors do a release and unlock, in *that*
// order. If their implementations change such that these assumptions
// are violated, a whole lot of code will break.
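
// ScopedFenceType encodes where, relative to a bracketed memory access, the
// barrier is issued (see the ScopedFenceGeneral specializations near the end
// of this file):
//
//   X_ACQUIRE       - acquire() after the access  (used by load_acquire)
//   RELEASE_X       - release() before the access (used by release_store)
//   RELEASE_X_FENCE - release() before and fence() after the access
//                     (used by release_store_fence)
//
// ScopedFence is a RAII helper: its constructor issues the prefix barrier and
// its destructor issues the postfix barrier, so an access performed while it
// is in scope is ordered accordingly.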

enum ScopedFenceType {
    X_ACQUIRE
  , RELEASE_X
  , RELEASE_X_FENCE
};

template <ScopedFenceType T>
class ScopedFenceGeneral: public StackObj {
 public:
  void prefix() {}
  void postfix() {}
};

template <ScopedFenceType T>
class ScopedFence : public ScopedFenceGeneral<T> {
  void *const _field;
 public:
  ScopedFence(void *const field) : _field(field) { prefix(); }
  ~ScopedFence() { postfix(); }
  void prefix() { ScopedFenceGeneral<T>::prefix(); }
  void postfix() { ScopedFenceGeneral<T>::postfix(); }
};

class OrderAccess : private Atomic {
 public:
  // barriers
  static void loadload();
  static void storestore();
  static void loadstore();
  static void storeload();

  static void acquire();
  static void release();
  static void fence();

  static void cross_modify_fence();

  template <typename T>
  static T load_acquire(const volatile T* p);

  template <typename T, typename D>
  static void release_store(volatile D* p, T v);

  template <typename T, typename D>
  static void release_store_fence(volatile D* p, T v);

 private:
  // This is a helper that invokes the StubRoutines::fence_entry()
  // routine if it exists. It should only be used by platforms that
  // don't have another way to do the inline assembly.
  static void StubRoutines_fence();

  // Give platforms a variation point to specialize.
  template<size_t byte_size, ScopedFenceType type> struct PlatformOrderedStore;
  template<size_t byte_size, ScopedFenceType type> struct PlatformOrderedLoad;

  template<typename FieldType, ScopedFenceType FenceType>
  static void ordered_store(volatile FieldType* p, FieldType v);

  template<typename FieldType, ScopedFenceType FenceType>
  static FieldType ordered_load(const volatile FieldType* p);
};

// The following methods can be specialized using simple template specialization
// in the platform specific files for optimization purposes. Otherwise the
// generalized variant is used.

template<size_t byte_size, ScopedFenceType type>
struct OrderAccess::PlatformOrderedStore {
  template <typename T>
  void operator()(T v, volatile T* p) const {
    ordered_store<T, type>(p, v);
  }
};

template<size_t byte_size, ScopedFenceType type>
struct OrderAccess::PlatformOrderedLoad {
  template <typename T>
  T operator()(const volatile T* p) const {
    return ordered_load<T, type>(p);
  }
};
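
// The OS/CPU header included below may specialize PlatformOrderedStore and
// PlatformOrderedLoad where the platform has a cheaper instruction sequence.
// As an illustration only (a sketch, not the code of any particular port),
// an x86-style port might implement the 4-byte release_store_fence with an
// xchg, which performs the store and acts as a full fence in one instruction:
//
//   template<>
//   struct OrderAccess::PlatformOrderedStore<4, RELEASE_X_FENCE> {
//     template <typename T>
//     void operator()(T v, volatile T* p) const {
//       __asm__ volatile ("xchgl (%2), %0"   // atomic exchange implies a
//                         : "=r" (v)         // full barrier on x86
//                         : "0" (v), "r" (p)
//                         : "memory");
//     }
//   };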

#include OS_CPU_HEADER(orderAccess)

template<> inline void ScopedFenceGeneral<X_ACQUIRE>::postfix()       { OrderAccess::acquire(); }
template<> inline void ScopedFenceGeneral<RELEASE_X>::prefix()        { OrderAccess::release(); }
template<> inline void ScopedFenceGeneral<RELEASE_X_FENCE>::prefix()  { OrderAccess::release(); }
template<> inline void ScopedFenceGeneral<RELEASE_X_FENCE>::postfix() { OrderAccess::fence(); }


template <typename FieldType, ScopedFenceType FenceType>
inline void OrderAccess::ordered_store(volatile FieldType* p, FieldType v) {
  ScopedFence<FenceType> f((void*)p);
  Atomic::store(v, p);
}

template <typename FieldType, ScopedFenceType FenceType>
inline FieldType OrderAccess::ordered_load(const volatile FieldType* p) {
  ScopedFence<FenceType> f((void*)p);
  return Atomic::load(p);
}

template <typename T>
inline T OrderAccess::load_acquire(const volatile T* p) {
  return LoadImpl<T, PlatformOrderedLoad<sizeof(T), X_ACQUIRE> >()(p);
}

template <typename T, typename D>
inline void OrderAccess::release_store(volatile D* p, T v) {
  StoreImpl<T, D, PlatformOrderedStore<sizeof(D), RELEASE_X> >()(v, p);
}

template <typename T, typename D>
inline void OrderAccess::release_store_fence(volatile D* p, T v) {
  StoreImpl<T, D, PlatformOrderedStore<sizeof(D), RELEASE_X_FENCE> >()(v, p);
}

#endif // SHARE_RUNTIME_ORDERACCESS_HPP