/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_RUNTIME_ORDERACCESS_HPP
#define SHARE_RUNTIME_ORDERACCESS_HPP

#include "memory/allocation.hpp"
#include "runtime/atomic.hpp"
#include "utilities/macros.hpp"

// Memory Access Ordering Model
//
// This interface is based on the JSR-133 Cookbook for Compiler Writers.
//
// In the following, the terms 'previous', 'subsequent', 'before',
// 'after', 'preceding' and 'succeeding' refer to program order. The
// terms 'down' and 'below' refer to forward load or store motion
// relative to program order, while 'up' and 'above' refer to backward
// motion.
//
// We define four primitive memory barrier operations.
//
// LoadLoad: Load1(s); LoadLoad; Load2
//
// Ensures that Load1 completes (obtains the value it loads from memory)
// before Load2 and any subsequent load operations. Loads before Load1
// may *not* float below Load2 and any subsequent load operations.
//
// StoreStore: Store1(s); StoreStore; Store2
//
// Ensures that Store1 completes (the effect on memory of Store1 is made
// visible to other processors) before Store2 and any subsequent store
// operations. Stores before Store1 may *not* float below Store2 and any
// subsequent store operations.
//
// LoadStore: Load1(s); LoadStore; Store2
//
// Ensures that Load1 completes before Store2 and any subsequent store
// operations. Loads before Load1 may *not* float below Store2 and any
// subsequent store operations.
//
// StoreLoad: Store1(s); StoreLoad; Load2
//
// Ensures that Store1 completes before Load2 and any subsequent load
// operations. Stores before Store1 may *not* float below Load2 and any
// subsequent load operations.
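//
// As an illustration only (a hedged sketch, not a pattern taken from this
// file), the StoreLoad constraint is what a Dekker-style handshake needs:
// each thread must make its own flag visible before it reads the other
// thread's flag. Using the OrderAccess members declared below (the flag
// fields _claim_a and _claim_b are hypothetical):
//
//   volatile int _claim_a = 0;     // written by thread A, read by B
//   volatile int _claim_b = 0;     // written by thread B, read by A
//
//   // Thread A                    // Thread B
//   _claim_a = 1;                  _claim_b = 1;
//   OrderAccess::storeload();      OrderAccess::storeload();
//   if (_claim_b == 0) { ... }     if (_claim_a == 0) { ... }
//
// Without the StoreLoad barrier each load may be satisfied before the
// preceding store becomes visible, and both threads could observe 0.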
//
// We define two further barriers: acquire and release.
//
// Conceptually, acquire/release semantics form unidirectional and
// asynchronous barriers w.r.t. a synchronizing load(X) and store(X) pair.
// They should always be used in pairs to publish (release store) and
// access (load acquire) some implicitly understood shared data between
// threads in a relatively cheap fashion not requiring storeload. If not
// used in such a pair, it is advised to use a membar instead:
// acquire/release only make sense as pairs.
//
// T1: access_shared_data
// T1: ]release
// T1: (...)
// T1: store(X)
//
// T2: load(X)
// T2: (...)
// T2: acquire[
// T2: access_shared_data
//
// It is guaranteed that if T2: load(X) synchronizes with (observes the
// value written by) T1: store(X), then the memory accesses before the T1:
// ]release happen before the memory accesses after the T2: acquire[.
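//
// Expressed with the bound operations declared below (a sketch only; the
// fields _data and _ready and the helpers compute() and use() are
// hypothetical):
//
//   // T1: publish
//   _data = compute();                        // access_shared_data
//   OrderAccess::release_store(&_ready, 1);   // ]release + store(X)
//
//   // T2: consume
//   if (OrderAccess::load_acquire(&_ready)) { // load(X) + acquire[
//     use(_data);                             // access_shared_data
//   }
//
// If T2 observes the value stored by T1, then T1's write to _data is
// guaranteed to be visible to T2's read of _data.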
//
// Total Store Order (TSO) machines can be seen as machines issuing a
// release store for each store and a load acquire for each load. Therefore
// there is an inherent resemblance between TSO and acquire/release
// semantics. TSO can be seen as an abstract machine where loads are
// executed immediately when encountered (hence no loadload reordering
// happens) but stores are enqueued in a FIFO queue for asynchronous
// serialization (so neither storestore nor loadstore reordering happens).
// The only reordering that happens is storeload, due to the queue
// asynchronously serializing stores (yet in order).
//
// Acquire/release semantics essentially exploit this asynchronicity: when
// the load(X) acquire[ observes the store of ]release store(X), the
// accesses before the release must have happened before the accesses after
// the acquire.
//
// The API offers both stand-alone acquire() and release() as well as bound
// load_acquire() and release_store(). It is guaranteed that these are
// semantically equivalent w.r.t. the defined model. However, since
// stand-alone acquire()/release() does not know which previous
// load/subsequent store is considered the synchronizing load/store, they
// may be more conservative in implementations. We advise using the bound
// variants whenever possible.
//
// Finally, we define a "fence" operation, as a bidirectional barrier.
// It guarantees that any memory access preceding the fence is not
// reordered w.r.t. any memory accesses subsequent to the fence in program
// order. This may be used to prevent sequences of loads from floating up
// above sequences of stores.
//
// The following table shows the implementations on some architectures:
//
//                    Constraint     x86          sparc TSO            ppc
// ---------------------------------------------------------------------------
// fence              LoadStore  |   lock         membar #StoreLoad    sync
//                    StoreStore |   addl 0,(sp)
//                    LoadLoad   |
//                    StoreLoad
//
// release            LoadStore  |                                     lwsync
//                    StoreStore
//
// acquire            LoadLoad   |                                     lwsync
//                    LoadStore
//
// release_store                     <store>      <store>              lwsync
//                                                                     <store>
//
// release_store_fence               xchg         <store>              lwsync
//                                                membar #StoreLoad    <store>
//                                                                     sync
//
//
// load_acquire                      <load>       <load>               <load>
//                                                                     lwsync
//
// Ordering a load relative to preceding stores requires a StoreLoad,
// which implies a membar #StoreLoad between the store and load under
// sparc-TSO. On x86, we use an explicitly locked add.
//
// Conventional usage is to issue a load_acquire for ordered loads. Use
// release_store for ordered stores when you care only that prior stores
// are visible before the release_store, but don't care exactly when the
// store associated with the release_store becomes visible. Use
// release_store_fence to update values like the thread state, where we
// don't want the current thread to continue until all our prior memory
// accesses (including the new thread state) are visible to other threads.
// This is equivalent to the volatile semantics of the Java Memory Model.
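//
// For example (a sketch only; _thread_state and the state constant are
// hypothetical stand-ins for the real thread-state machinery):
//
//   // Writer: publish the new state and do not continue past this point
//   // until the state (and all prior accesses) is visible to other threads.
//   OrderAccess::release_store_fence(&_thread_state, _thread_in_native);
//
//   // Reader: observe the state, then safely read anything published
//   // before the state change.
//   int state = OrderAccess::load_acquire(&_thread_state);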
//
// C++ Volatile Semantics
//
// C++ volatile semantics prevent compiler re-ordering between
// volatile memory accesses. However, reordering between non-volatile
// and volatile memory accesses is in general undefined. For compiler
// reordering constraints taking non-volatile memory accesses into
// consideration, a compiler barrier has to be used instead. Some
// compiler implementations may choose to enforce additional
// constraints beyond those required by the language. Note also that
// neither volatile semantics nor a compiler barrier prevents hardware
// reordering.
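//
// For instance (a sketch only; the fields below are hypothetical):
//
//   int          _payload;  // ordinary field
//   volatile int _flag;     // volatile field
//
//   _payload = 42;          // the compiler may legally sink this below the
//   _flag = 1;              // volatile store: volatile only constrains the
//                           // ordering of volatile accesses w.r.t. each other
//
// Ordering _payload with _flag requires a compiler barrier, and making the
// ordering visible to other processors additionally requires a hardware
// barrier (e.g. release_store for the publishing store).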
//
// os::is_MP Considered Redundant
//
// Callers of this interface do not need to test os::is_MP() before
// issuing an operation. The test is taken care of by the implementation
// of the interface (depending on the vm version and platform, the test
// may or may not be actually done by the implementation).
//
//
// A Note on Memory Ordering and Cache Coherency
//
// Cache coherency and memory ordering are orthogonal concepts, though they
// interact. E.g., all existing itanium machines are cache-coherent, but
// the hardware can freely reorder loads wrt other loads unless it sees a
// load-acquire instruction. All existing sparc machines are cache-coherent
// and, unlike itanium, TSO guarantees that the hardware orders loads wrt
// loads and stores, and stores wrt each other.
//
// Consider the implementation of loadload. *If* your platform *isn't*
// cache-coherent, then loadload must not only prevent hardware load
// instruction reordering, but it must *also* ensure that subsequent
// loads from addresses that could be written by other processors (i.e.,
// that are broadcast by other processors) go all the way to the first
// level of memory shared by those processors and the one issuing
// the loadload.
//
// So if we have an MP that has, say, a per-processor D$ that doesn't see
// writes by other processors, and has a shared E$ that does, the loadload
// barrier would have to make sure that either
//
// 1. cache lines in the issuing processor's D$ that contained data from
// addresses that could be written by other processors are invalidated, so
// subsequent loads from those addresses go to the E$ (it could do this
// by tagging such cache lines as 'shared', though how to tell the hardware
// to do the tagging is an interesting problem), or
//
// 2. there never are such cache lines in the issuing processor's D$, which
// means all references to shared data (however identified: see above)
// bypass the D$ (i.e., are satisfied from the E$).
//
// If your machine doesn't have an E$, substitute 'main memory' for 'E$'.
//
// Either of these alternatives is a pain, so no current machine we know of
// has incoherent caches.
//
// If loadload didn't have these properties, the store-release sequence for
// publishing a shared data structure wouldn't work, because a processor
// trying to read data newly published by another processor might go to
// its own incoherent caches to satisfy the read instead of to the newly
// written shared memory.
//
//
// NOTE WELL!!
//
// A Note on MutexLocker and Friends
//
// See mutexLocker.hpp. We assume throughout the VM that MutexLocker's
// and friends' constructors do a fence, a lock and an acquire *in that
// order*. And that their destructors do a release and unlock, in *that*
// order. If their implementations change such that these assumptions
// are violated, a whole lot of code will break.

enum ScopedFenceType {
    X_ACQUIRE
    , RELEASE_X
    , RELEASE_X_FENCE
};

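// ScopedFenceGeneral is the default, no-op fence policy; the
// specializations following the platform include below map X_ACQUIRE,
// RELEASE_X and RELEASE_X_FENCE onto acquire(), release() and
// release()+fence() respectively. ScopedFence applies the chosen policy
// around an access RAII-style: prefix() in the constructor, postfix() in
// the destructor.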
template <ScopedFenceType T>
class ScopedFenceGeneral: public StackObj {
 public:
  void prefix() {}
  void postfix() {}
};

template <ScopedFenceType T>
class ScopedFence : public ScopedFenceGeneral<T> {
  void *const _field;
 public:
  ScopedFence(void *const field) : _field(field) { prefix(); }
  ~ScopedFence() { postfix(); }
  void prefix() { ScopedFenceGeneral<T>::prefix(); }
  void postfix() { ScopedFenceGeneral<T>::postfix(); }
};

class OrderAccess : private Atomic {
 public:
  // barriers
  static void loadload();
  static void storestore();
  static void loadstore();
  static void storeload();

  static void acquire();
  static void release();
  static void fence();

  static void cross_modify_fence();

  template <typename T>
  static T load_acquire(const volatile T* p);

  template <typename T, typename D>
  static void release_store(volatile D* p, T v);

  template <typename T, typename D>
  static void release_store_fence(volatile D* p, T v);

 private:
  // This is a helper that invokes the StubRoutines::fence_entry()
  // routine if it exists. It should only be used by platforms that
  // don't have another way to do the inline assembly.
  static void StubRoutines_fence();

  // Give platforms a variation point to specialize.
  template<size_t byte_size, ScopedFenceType type> struct PlatformOrderedStore;
  template<size_t byte_size, ScopedFenceType type> struct PlatformOrderedLoad;

  template<typename FieldType, ScopedFenceType FenceType>
  static void ordered_store(volatile FieldType* p, FieldType v);

  template<typename FieldType, ScopedFenceType FenceType>
  static FieldType ordered_load(const volatile FieldType* p);
};

// The following methods can be specialized using simple template specialization
// in the platform-specific files for optimization purposes. Otherwise the
// generalized variant is used.

template<size_t byte_size, ScopedFenceType type>
struct OrderAccess::PlatformOrderedStore {
  template <typename T>
  void operator()(T v, volatile T* p) const {
    ordered_store<T, type>(p, v);
  }
};

template<size_t byte_size, ScopedFenceType type>
struct OrderAccess::PlatformOrderedLoad {
  template <typename T>
  T operator()(const volatile T* p) const {
    return ordered_load<T, type>(p);
  }
};
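
// As an illustration only (a hedged sketch, not code from any particular
// platform file), a header included below might specialize the 4-byte
// acquiring load along these lines, e.g. with a GCC-style builtin:
//
//   template<>
//   struct OrderAccess::PlatformOrderedLoad<4, X_ACQUIRE> {
//     template <typename T>
//     T operator()(const volatile T* p) const {
//       T data;
//       __atomic_load(const_cast<T*>(p), &data, __ATOMIC_ACQUIRE);
//       return data;
//     }
//   };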

#include OS_CPU_HEADER(orderAccess)

template<> inline void ScopedFenceGeneral<X_ACQUIRE>::postfix()       { OrderAccess::acquire(); }
template<> inline void ScopedFenceGeneral<RELEASE_X>::prefix()        { OrderAccess::release(); }
template<> inline void ScopedFenceGeneral<RELEASE_X_FENCE>::prefix()  { OrderAccess::release(); }
template<> inline void ScopedFenceGeneral<RELEASE_X_FENCE>::postfix() { OrderAccess::fence(); }


template <typename FieldType, ScopedFenceType FenceType>
inline void OrderAccess::ordered_store(volatile FieldType* p, FieldType v) {
  ScopedFence<FenceType> f((void*)p);
  Atomic::store(v, p);
}

template <typename FieldType, ScopedFenceType FenceType>
inline FieldType OrderAccess::ordered_load(const volatile FieldType* p) {
  ScopedFence<FenceType> f((void*)p);
  return Atomic::load(p);
}

template <typename T>
inline T OrderAccess::load_acquire(const volatile T* p) {
  return LoadImpl<T, PlatformOrderedLoad<sizeof(T), X_ACQUIRE> >()(p);
}

template <typename T, typename D>
inline void OrderAccess::release_store(volatile D* p, T v) {
  StoreImpl<T, D, PlatformOrderedStore<sizeof(D), RELEASE_X> >()(v, p);
}

template <typename T, typename D>
inline void OrderAccess::release_store_fence(volatile D* p, T v) {
  StoreImpl<T, D, PlatformOrderedStore<sizeof(D), RELEASE_X_FENCE> >()(v, p);
}

#endif // SHARE_RUNTIME_ORDERACCESS_HPP
| 352 | |