1 | /* |
2 | Copyright (c) 2005-2019 Intel Corporation |
3 | |
4 | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | you may not use this file except in compliance with the License. |
6 | You may obtain a copy of the License at |
7 | |
8 | http://www.apache.org/licenses/LICENSE-2.0 |
9 | |
10 | Unless required by applicable law or agreed to in writing, software |
11 | distributed under the License is distributed on an "AS IS" BASIS, |
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | See the License for the specific language governing permissions and |
14 | limitations under the License. |
15 | */ |
16 | |
17 | #ifndef __TBB_machine_H |
18 | #define __TBB_machine_H |
19 | |
/** This header provides a basic platform abstraction layer by hooking up the appropriate
architecture/OS/compiler-specific headers from the /include/tbb/machine directory.
If a plug-in header does not implement all the required APIs, it must request generic
implementations of the missing ones by setting one or more of the following macros:
24 | |
25 | __TBB_USE_GENERIC_PART_WORD_CAS |
26 | __TBB_USE_GENERIC_PART_WORD_FETCH_ADD |
27 | __TBB_USE_GENERIC_PART_WORD_FETCH_STORE |
28 | __TBB_USE_GENERIC_FETCH_ADD |
29 | __TBB_USE_GENERIC_FETCH_STORE |
30 | __TBB_USE_GENERIC_DWORD_FETCH_ADD |
31 | __TBB_USE_GENERIC_DWORD_FETCH_STORE |
32 | __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE |
33 | __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE |
34 | __TBB_USE_GENERIC_RELAXED_LOAD_STORE |
35 | __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE |
36 | |
In this case tbb_machine.h will add the missing functionality based on a minimal set
of APIs that all plug-in headers are required to implement, as described below.
40 | Note that these generic implementations may be sub-optimal for a particular |
41 | architecture, and thus should be relied upon only after careful evaluation |
42 | or as the last resort. |
43 | |
Additionally, __TBB_64BIT_ATOMICS can be set to 0 on a 32-bit architecture to
indicate that the port is not going to support double-word atomics. It may also
46 | be set to 1 explicitly, though normally this is not necessary as tbb_machine.h |
47 | will set it automatically. |
48 | |
The __TBB_ENDIANNESS macro can be defined by the implementation as well.
50 | It is used only if __TBB_USE_GENERIC_PART_WORD_CAS is set (or for testing), |
51 | and must specify the layout of aligned 16-bit and 32-bit data anywhere within a process |
52 | (while the details of unaligned 16-bit or 32-bit data or of 64-bit data are irrelevant). |
53 | The layout must be the same at all relevant memory locations within the current process; |
54 | in case of page-specific endianness, one endianness must be kept "out of sight". |
55 | Possible settings, reflecting hardware and possibly O.S. convention, are: |
56 | - __TBB_ENDIAN_BIG for big-endian data, |
57 | - __TBB_ENDIAN_LITTLE for little-endian data, |
58 | - __TBB_ENDIAN_DETECT for run-time detection iff exactly one of the above, |
59 | - __TBB_ENDIAN_UNSUPPORTED to prevent undefined behavior if none of the above. |
60 | |
61 | Prerequisites for each architecture port |
62 | ---------------------------------------- |
The following functions and macros have no generic implementation. Therefore they must be
implemented in each architecture-specific machine header, either as a conventional
function or as a function-like macro.
66 | |
67 | __TBB_WORDSIZE |
This is the size of a machine word in bytes, e.g. for 32-bit systems it
should be defined as 4.
70 | |
71 | __TBB_Yield() |
72 | Signals OS that the current thread is willing to relinquish the remainder |
73 | of its time quantum. |
74 | |
75 | __TBB_full_memory_fence() |
76 | Must prevent all memory operations from being reordered across it (both |
77 | by hardware and compiler). All such fences must be totally ordered (or |
78 | sequentially consistent). |
79 | |
80 | __TBB_machine_cmpswp4( volatile void *ptr, int32_t value, int32_t comparand ) |
81 | Must be provided if __TBB_USE_FENCED_ATOMICS is not set. |
82 | |
__TBB_machine_cmpswp8( volatile void *ptr, int64_t value, int64_t comparand )
Must be provided for 64-bit architectures if __TBB_USE_FENCED_ATOMICS is not set,
and for 32-bit architectures if __TBB_64BIT_ATOMICS is set.
86 | |
87 | __TBB_machine_<op><S><fence>(...), where |
88 | <op> = {cmpswp, fetchadd, fetchstore} |
89 | <S> = {1, 2, 4, 8} |
90 | <fence> = {full_fence, acquire, release, relaxed} |
91 | Must be provided if __TBB_USE_FENCED_ATOMICS is set. |
92 | |
93 | __TBB_control_consistency_helper() |
94 | Bridges the memory-semantics gap between architectures providing only |
95 | implicit C++0x "consume" semantics (like Power Architecture) and those |
96 | also implicitly obeying control dependencies (like IA-64 architecture). |
97 | It must be used only in conditional code where the condition is itself |
98 | data-dependent, and will then make subsequent code behave as if the |
99 | original data dependency were acquired. |
100 | It needs only a compiler fence where implied by the architecture |
101 | either specifically (like IA-64 architecture) or because generally stronger |
102 | "acquire" semantics are enforced (like x86). |
103 | It is always valid, though potentially suboptimal, to replace |
104 | control with acquire on the load and then remove the helper. |
105 | |
106 | __TBB_acquire_consistency_helper(), __TBB_release_consistency_helper() |
107 | Must be provided if __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE is set. |
108 | Enforce acquire and release semantics in generic implementations of fenced |
109 | store and load operations. Depending on the particular architecture/compiler |
110 | combination they may be a hardware fence, a compiler fence, both or nothing. |
111 | **/ |
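/* A minimal sketch of what a hypothetical plug-in header might provide when it
   relies on the generic fallbacks described above. All my_port_* names are
   illustrative assumptions, not part of TBB:

       // machine/my_port.h (hypothetical)
       #define __TBB_WORDSIZE 4
       #define __TBB_Yield()                        my_port_yield()
       #define __TBB_full_memory_fence()            my_port_full_fence()
       #define __TBB_control_consistency_helper()   my_port_compiler_fence()
       #define __TBB_acquire_consistency_helper()   my_port_compiler_fence()
       #define __TBB_release_consistency_helper()   my_port_compiler_fence()
       extern "C" int32_t my_port_cas4( volatile void* ptr, int32_t value, int32_t comparand );
       #define __TBB_machine_cmpswp4                my_port_cas4
       // Ask tbb_machine.h to synthesize everything else from cmpswp4:
       #define __TBB_USE_GENERIC_PART_WORD_CAS                     1
       #define __TBB_USE_GENERIC_FETCH_ADD                         1
       #define __TBB_USE_GENERIC_FETCH_STORE                       1
       #define __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE            1
       #define __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE 1
       #define __TBB_USE_GENERIC_RELAXED_LOAD_STORE                1
       #define __TBB_64BIT_ATOMICS                                 0
*/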
112 | |
113 | #include "tbb_stddef.h" |
114 | |
115 | namespace tbb { |
116 | namespace internal { //< @cond INTERNAL |
117 | |
118 | //////////////////////////////////////////////////////////////////////////////// |
119 | // Overridable helpers declarations |
120 | // |
121 | // A machine/*.h file may choose to define these templates, otherwise it must |
122 | // request default implementation by setting appropriate __TBB_USE_GENERIC_XXX macro(s). |
123 | // |
124 | template <typename T, std::size_t S> |
125 | struct machine_load_store; |
126 | |
127 | template <typename T, std::size_t S> |
128 | struct machine_load_store_relaxed; |
129 | |
130 | template <typename T, std::size_t S> |
131 | struct machine_load_store_seq_cst; |
132 | // |
133 | // End of overridable helpers declarations |
134 | //////////////////////////////////////////////////////////////////////////////// |
135 | |
136 | template<size_t S> struct atomic_selector; |
137 | |
138 | template<> struct atomic_selector<1> { |
139 | typedef int8_t word; |
140 | inline static word fetch_store ( volatile void* location, word value ); |
141 | }; |
142 | |
143 | template<> struct atomic_selector<2> { |
144 | typedef int16_t word; |
145 | inline static word fetch_store ( volatile void* location, word value ); |
146 | }; |
147 | |
148 | template<> struct atomic_selector<4> { |
149 | #if _MSC_VER && !_WIN64 |
150 | // Work-around that avoids spurious /Wp64 warnings |
151 | typedef intptr_t word; |
152 | #else |
153 | typedef int32_t word; |
154 | #endif |
155 | inline static word fetch_store ( volatile void* location, word value ); |
156 | }; |
157 | |
158 | template<> struct atomic_selector<8> { |
159 | typedef int64_t word; |
160 | inline static word fetch_store ( volatile void* location, word value ); |
161 | }; |
162 | |
163 | }} //< namespaces internal @endcond, tbb |
164 | |
165 | #define __TBB_MACHINE_DEFINE_STORE8_GENERIC_FENCED(M) \ |
166 | inline void __TBB_machine_generic_store8##M(volatile void *ptr, int64_t value) { \ |
167 | for(;;) { \ |
168 | int64_t result = *(volatile int64_t *)ptr; \ |
169 | if( __TBB_machine_cmpswp8##M(ptr,value,result)==result ) break; \ |
170 | } \ |
171 | } \ |
172 | |
173 | #define __TBB_MACHINE_DEFINE_LOAD8_GENERIC_FENCED(M) \ |
174 | inline int64_t __TBB_machine_generic_load8##M(const volatile void *ptr) { \ |
/* Comparand and new value may be anything; they only have to be equal, and */ \
/* the value should have a low probability of actually being found in 'location'. */ \
177 | const int64_t anyvalue = 2305843009213693951LL; \ |
178 | return __TBB_machine_cmpswp8##M(const_cast<volatile void *>(ptr),anyvalue,anyvalue); \ |
179 | } \ |
180 | |
181 | // The set of allowed values for __TBB_ENDIANNESS (see above for details) |
182 | #define __TBB_ENDIAN_UNSUPPORTED -1 |
183 | #define __TBB_ENDIAN_LITTLE 0 |
184 | #define __TBB_ENDIAN_BIG 1 |
185 | #define __TBB_ENDIAN_DETECT 2 |
186 | |
187 | #if _WIN32||_WIN64 |
188 | |
189 | #ifdef _MANAGED |
190 | #pragma managed(push, off) |
191 | #endif |
192 | |
193 | #if __MINGW64__ || __MINGW32__ |
194 | extern "C" __declspec(dllimport) int __stdcall SwitchToThread( void ); |
195 | #define __TBB_Yield() SwitchToThread() |
196 | #if (TBB_USE_GCC_BUILTINS && __TBB_GCC_BUILTIN_ATOMICS_PRESENT) |
197 | #include "machine/gcc_generic.h" |
198 | #elif __MINGW64__ |
199 | #include "machine/linux_intel64.h" |
200 | #elif __MINGW32__ |
201 | #include "machine/linux_ia32.h" |
202 | #endif |
203 | #elif (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT) |
204 | #include "machine/icc_generic.h" |
205 | #elif defined(_M_IX86) && !defined(__TBB_WIN32_USE_CL_BUILTINS) |
206 | #include "machine/windows_ia32.h" |
207 | #elif defined(_M_X64) |
208 | #include "machine/windows_intel64.h" |
209 | #elif defined(_M_ARM) || defined(__TBB_WIN32_USE_CL_BUILTINS) |
210 | #include "machine/msvc_armv7.h" |
211 | #endif |
212 | |
213 | #ifdef _MANAGED |
214 | #pragma managed(pop) |
215 | #endif |
216 | |
217 | #elif __TBB_DEFINE_MIC |
218 | |
219 | #include "machine/mic_common.h" |
220 | #if (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT) |
221 | #include "machine/icc_generic.h" |
222 | #else |
223 | #include "machine/linux_intel64.h" |
224 | #endif |
225 | |
226 | #elif __linux__ || __FreeBSD__ || __NetBSD__ || __OpenBSD__ |
227 | |
228 | #if (TBB_USE_GCC_BUILTINS && __TBB_GCC_BUILTIN_ATOMICS_PRESENT) |
229 | #include "machine/gcc_generic.h" |
230 | #elif (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT) |
231 | #include "machine/icc_generic.h" |
232 | #elif __i386__ |
233 | #include "machine/linux_ia32.h" |
234 | #elif __x86_64__ |
235 | #include "machine/linux_intel64.h" |
236 | #elif __ia64__ |
237 | #include "machine/linux_ia64.h" |
238 | #elif __powerpc__ |
239 | #include "machine/mac_ppc.h" |
240 | #elif __ARM_ARCH_7A__ || __aarch64__ |
241 | #include "machine/gcc_arm.h" |
242 | #elif __TBB_GCC_BUILTIN_ATOMICS_PRESENT |
243 | #include "machine/gcc_generic.h" |
244 | #endif |
245 | #include "machine/linux_common.h" |
246 | |
247 | #elif __APPLE__ |
248 | //TODO: TBB_USE_GCC_BUILTINS is not used for Mac, Sun, Aix |
249 | #if (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT) |
250 | #include "machine/icc_generic.h" |
251 | #elif __TBB_x86_32 |
252 | #include "machine/linux_ia32.h" |
253 | #elif __TBB_x86_64 |
254 | #include "machine/linux_intel64.h" |
255 | #elif __POWERPC__ |
256 | #include "machine/mac_ppc.h" |
257 | #endif |
258 | #include "machine/macos_common.h" |
259 | |
260 | #elif _AIX |
261 | |
262 | #include "machine/ibm_aix51.h" |
263 | |
264 | #elif __sun || __SUNPRO_CC |
265 | |
266 | #define __asm__ asm |
267 | #define __volatile__ volatile |
268 | |
269 | #if __i386 || __i386__ |
270 | #include "machine/linux_ia32.h" |
271 | #elif __x86_64__ |
272 | #include "machine/linux_intel64.h" |
273 | #elif __sparc |
274 | #include "machine/sunos_sparc.h" |
275 | #endif |
276 | #include <sched.h> |
277 | |
278 | #define __TBB_Yield() sched_yield() |
279 | |
280 | #endif /* OS selection */ |
281 | |
282 | #ifndef __TBB_64BIT_ATOMICS |
283 | #define __TBB_64BIT_ATOMICS 1 |
284 | #endif |
285 | |
286 | //TODO: replace usage of these functions with usage of tbb::atomic, and then remove them |
//TODO: map functions with the W suffix to use a cast to tbb::atomic and the corresponding op, i.e. as_atomic().op()
288 | // Special atomic functions |
289 | #if __TBB_USE_FENCED_ATOMICS |
290 | #define __TBB_machine_cmpswp1 __TBB_machine_cmpswp1full_fence |
291 | #define __TBB_machine_cmpswp2 __TBB_machine_cmpswp2full_fence |
292 | #define __TBB_machine_cmpswp4 __TBB_machine_cmpswp4full_fence |
293 | #define __TBB_machine_cmpswp8 __TBB_machine_cmpswp8full_fence |
294 | |
295 | #if __TBB_WORDSIZE==8 |
296 | #define __TBB_machine_fetchadd8 __TBB_machine_fetchadd8full_fence |
297 | #define __TBB_machine_fetchstore8 __TBB_machine_fetchstore8full_fence |
298 | #define __TBB_FetchAndAddWrelease(P,V) __TBB_machine_fetchadd8release(P,V) |
299 | #define __TBB_FetchAndIncrementWacquire(P) __TBB_machine_fetchadd8acquire(P,1) |
300 | #define __TBB_FetchAndDecrementWrelease(P) __TBB_machine_fetchadd8release(P,(-1)) |
301 | #else |
302 | #define __TBB_machine_fetchadd4 __TBB_machine_fetchadd4full_fence |
303 | #define __TBB_machine_fetchstore4 __TBB_machine_fetchstore4full_fence |
304 | #define __TBB_FetchAndAddWrelease(P,V) __TBB_machine_fetchadd4release(P,V) |
305 | #define __TBB_FetchAndIncrementWacquire(P) __TBB_machine_fetchadd4acquire(P,1) |
306 | #define __TBB_FetchAndDecrementWrelease(P) __TBB_machine_fetchadd4release(P,(-1)) |
307 | #endif /* __TBB_WORDSIZE==4 */ |
308 | #else /* !__TBB_USE_FENCED_ATOMICS */ |
309 | #define __TBB_FetchAndAddWrelease(P,V) __TBB_FetchAndAddW(P,V) |
310 | #define __TBB_FetchAndIncrementWacquire(P) __TBB_FetchAndAddW(P,1) |
311 | #define __TBB_FetchAndDecrementWrelease(P) __TBB_FetchAndAddW(P,(-1)) |
312 | #endif /* !__TBB_USE_FENCED_ATOMICS */ |
313 | |
314 | #if __TBB_WORDSIZE==4 |
315 | #define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp4(P,V,C) |
316 | #define __TBB_FetchAndAddW(P,V) __TBB_machine_fetchadd4(P,V) |
317 | #define __TBB_FetchAndStoreW(P,V) __TBB_machine_fetchstore4(P,V) |
318 | #elif __TBB_WORDSIZE==8 |
319 | #if __TBB_USE_GENERIC_DWORD_LOAD_STORE || __TBB_USE_GENERIC_DWORD_FETCH_ADD || __TBB_USE_GENERIC_DWORD_FETCH_STORE |
320 | #error These macros should only be used on 32-bit platforms. |
321 | #endif |
322 | |
323 | #define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp8(P,V,C) |
324 | #define __TBB_FetchAndAddW(P,V) __TBB_machine_fetchadd8(P,V) |
325 | #define __TBB_FetchAndStoreW(P,V) __TBB_machine_fetchstore8(P,V) |
326 | #else /* __TBB_WORDSIZE != 8 */ |
327 | #error Unsupported machine word size. |
328 | #endif /* __TBB_WORDSIZE */ |
329 | |
330 | #ifndef __TBB_Pause |
331 | inline void __TBB_Pause(int32_t) { |
332 | __TBB_Yield(); |
333 | } |
334 | #endif |
335 | |
336 | namespace tbb { |
337 | |
338 | //! Sequentially consistent full memory fence. |
339 | inline void atomic_fence () { __TBB_full_memory_fence(); } |
340 | |
341 | namespace internal { //< @cond INTERNAL |
342 | |
343 | //! Class that implements exponential backoff. |
344 | /** See implementation of spin_wait_while_eq for an example. */ |
345 | class atomic_backoff : no_copy { |
346 | //! Time delay, in units of "pause" instructions. |
347 | /** Should be equal to approximately the number of "pause" instructions |
that take the same time as a context switch. Must be a power of two. */
349 | static const int32_t LOOPS_BEFORE_YIELD = 16; |
350 | int32_t count; |
351 | public: |
// In many cases, an object of this type is initialized eagerly on a hot path,
353 | // as in for(atomic_backoff b; ; b.pause()) { /*loop body*/ } |
354 | // For this reason, the construction cost must be very small! |
355 | atomic_backoff() : count(1) {} |
356 | // This constructor pauses immediately; do not use on hot paths! |
357 | atomic_backoff( bool ) : count(1) { pause(); } |
358 | |
359 | //! Pause for a while. |
360 | void pause() { |
361 | if( count<=LOOPS_BEFORE_YIELD ) { |
362 | __TBB_Pause(count); |
363 | // Pause twice as long the next time. |
364 | count*=2; |
365 | } else { |
366 | // Pause is so long that we might as well yield CPU to scheduler. |
367 | __TBB_Yield(); |
368 | } |
369 | } |
370 | |
371 | //! Pause for a few times and return false if saturated. |
372 | bool bounded_pause() { |
373 | __TBB_Pause(count); |
374 | if( count<LOOPS_BEFORE_YIELD ) { |
375 | // Pause twice as long the next time. |
376 | count*=2; |
377 | return true; |
378 | } else { |
379 | return false; |
380 | } |
381 | } |
382 | |
383 | void reset() { |
384 | count = 1; |
385 | } |
386 | }; |
387 | |
388 | //! Spin WHILE the value of the variable is equal to a given value |
389 | /** T and U should be comparable types. */ |
390 | template<typename T, typename U> |
391 | void spin_wait_while_eq( const volatile T& location, U value ) { |
392 | atomic_backoff backoff; |
393 | while( location==value ) backoff.pause(); |
394 | } |
395 | |
396 | //! Spin UNTIL the value of the variable is equal to a given value |
397 | /** T and U should be comparable types. */ |
398 | template<typename T, typename U> |
399 | void spin_wait_until_eq( const volatile T& location, const U value ) { |
400 | atomic_backoff backoff; |
401 | while( location!=value ) backoff.pause(); |
402 | } |
403 | |
404 | template <typename predicate_type> |
405 | void spin_wait_while(predicate_type condition){ |
406 | atomic_backoff backoff; |
407 | while( condition() ) backoff.pause(); |
408 | } |
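// Illustrative usage (hypothetical variables, not part of TBB): a consumer can
// spin until a producer sets a shared flag, pausing with exponential backoff
// and eventually yielding:
//
//     volatile intptr_t ready = 0;                    // set to 1 by another thread
//     tbb::internal::spin_wait_until_eq( ready, 1 );  // returns once ready==1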
409 | |
410 | //////////////////////////////////////////////////////////////////////////////// |
411 | // Generic compare-and-swap applied to only a part of a machine word. |
412 | // |
413 | #ifndef __TBB_ENDIANNESS |
414 | #define __TBB_ENDIANNESS __TBB_ENDIAN_DETECT |
415 | #endif |
416 | |
417 | #if __TBB_USE_GENERIC_PART_WORD_CAS && __TBB_ENDIANNESS==__TBB_ENDIAN_UNSUPPORTED |
418 | #error Generic implementation of part-word CAS may not be used with __TBB_ENDIAN_UNSUPPORTED |
419 | #endif |
420 | |
421 | #if __TBB_ENDIANNESS!=__TBB_ENDIAN_UNSUPPORTED |
422 | // |
423 | // This function is the only use of __TBB_ENDIANNESS. |
424 | // The following restrictions/limitations apply for this operation: |
425 | // - T must be an integer type of at most 4 bytes for the casts and calculations to work |
426 | // - T must also be less than 4 bytes to avoid compiler warnings when computing mask |
427 | // (and for the operation to be useful at all, so no workaround is applied) |
428 | // - the architecture must consistently use either little-endian or big-endian (same for all locations) |
429 | // |
430 | // TODO: static_assert for the type requirements stated above |
431 | template<typename T> |
432 | inline T __TBB_MaskedCompareAndSwap (volatile T * const ptr, const T value, const T comparand ) { |
433 | struct endianness{ static bool is_big_endian(){ |
434 | #if __TBB_ENDIANNESS==__TBB_ENDIAN_DETECT |
435 | const uint32_t probe = 0x03020100; |
436 | return (((const char*)(&probe))[0]==0x03); |
437 | #elif __TBB_ENDIANNESS==__TBB_ENDIAN_BIG || __TBB_ENDIANNESS==__TBB_ENDIAN_LITTLE |
438 | return __TBB_ENDIANNESS==__TBB_ENDIAN_BIG; |
439 | #else |
440 | #error Unexpected value of __TBB_ENDIANNESS |
441 | #endif |
442 | }}; |
443 | |
444 | const uint32_t byte_offset = (uint32_t) ((uintptr_t)ptr & 0x3); |
445 | volatile uint32_t * const aligned_ptr = (uint32_t*)((uintptr_t)ptr - byte_offset ); |
446 | |
447 | // location of T within uint32_t for a C++ shift operation |
448 | const uint32_t bits_to_shift = 8*(endianness::is_big_endian() ? (4 - sizeof(T) - (byte_offset)) : byte_offset); |
449 | const uint32_t mask = (((uint32_t)1<<(sizeof(T)*8)) - 1 )<<bits_to_shift; |
450 | // for signed T, any sign extension bits in cast value/comparand are immediately clipped by mask |
451 | const uint32_t shifted_comparand = ((uint32_t)comparand << bits_to_shift)&mask; |
452 | const uint32_t shifted_value = ((uint32_t)value << bits_to_shift)&mask; |
453 | |
454 | for( atomic_backoff b;;b.pause() ) { |
455 | const uint32_t surroundings = *aligned_ptr & ~mask ; // may have changed during the pause |
456 | const uint32_t big_comparand = surroundings | shifted_comparand ; |
457 | const uint32_t big_value = surroundings | shifted_value ; |
458 | // __TBB_machine_cmpswp4 presumed to have full fence. |
459 | // Cast shuts up /Wp64 warning |
460 | const uint32_t big_result = (uint32_t)__TBB_machine_cmpswp4( aligned_ptr, big_value, big_comparand ); |
461 | if( big_result == big_comparand // CAS succeeded |
462 | || ((big_result ^ big_comparand) & mask) != 0) // CAS failed and the bits of interest have changed |
463 | { |
464 | return T((big_result & mask) >> bits_to_shift); |
465 | } |
466 | else continue; // CAS failed but the bits of interest were not changed |
467 | } |
468 | } |
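// Worked example (hypothetical values): on a little-endian machine, CASing an int16_t
// located at byte offset 2 of its aligned 32-bit word gives byte_offset==2,
// bits_to_shift==16 and mask==0xFFFF0000, so the 4-byte CAS above operates on the
// upper half-word only while the 'surroundings' bits are preserved unchanged.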
469 | #endif // __TBB_ENDIANNESS!=__TBB_ENDIAN_UNSUPPORTED |
470 | //////////////////////////////////////////////////////////////////////////////// |
471 | |
472 | template<size_t S, typename T> |
473 | inline T __TBB_CompareAndSwapGeneric (volatile void *ptr, T value, T comparand ); |
474 | |
475 | template<> |
476 | inline int8_t __TBB_CompareAndSwapGeneric <1,int8_t> (volatile void *ptr, int8_t value, int8_t comparand ) { |
477 | #if __TBB_USE_GENERIC_PART_WORD_CAS |
478 | return __TBB_MaskedCompareAndSwap<int8_t>((volatile int8_t *)ptr,value,comparand); |
479 | #else |
480 | return __TBB_machine_cmpswp1(ptr,value,comparand); |
481 | #endif |
482 | } |
483 | |
484 | template<> |
485 | inline int16_t __TBB_CompareAndSwapGeneric <2,int16_t> (volatile void *ptr, int16_t value, int16_t comparand ) { |
486 | #if __TBB_USE_GENERIC_PART_WORD_CAS |
487 | return __TBB_MaskedCompareAndSwap<int16_t>((volatile int16_t *)ptr,value,comparand); |
488 | #else |
489 | return __TBB_machine_cmpswp2(ptr,value,comparand); |
490 | #endif |
491 | } |
492 | |
493 | template<> |
494 | inline int32_t __TBB_CompareAndSwapGeneric <4,int32_t> (volatile void *ptr, int32_t value, int32_t comparand ) { |
495 | // Cast shuts up /Wp64 warning |
496 | return (int32_t)__TBB_machine_cmpswp4(ptr,value,comparand); |
497 | } |
498 | |
499 | #if __TBB_64BIT_ATOMICS |
500 | template<> |
501 | inline int64_t __TBB_CompareAndSwapGeneric <8,int64_t> (volatile void *ptr, int64_t value, int64_t comparand ) { |
502 | return __TBB_machine_cmpswp8(ptr,value,comparand); |
503 | } |
504 | #endif |
505 | |
506 | template<size_t S, typename T> |
507 | inline T __TBB_FetchAndAddGeneric (volatile void *ptr, T addend) { |
508 | T result; |
509 | for( atomic_backoff b;;b.pause() ) { |
510 | result = *reinterpret_cast<volatile T *>(ptr); |
511 | // __TBB_CompareAndSwapGeneric presumed to have full fence. |
512 | if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, result+addend, result )==result ) |
513 | break; |
514 | } |
515 | return result; |
516 | } |
517 | |
518 | template<size_t S, typename T> |
519 | inline T __TBB_FetchAndStoreGeneric (volatile void *ptr, T value) { |
520 | T result; |
521 | for( atomic_backoff b;;b.pause() ) { |
522 | result = *reinterpret_cast<volatile T *>(ptr); |
523 | // __TBB_CompareAndSwapGeneric presumed to have full fence. |
524 | if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, value, result )==result ) |
525 | break; |
526 | } |
527 | return result; |
528 | } |
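// Illustrative usage (a sketch, not part of the public API): both generic operations
// return the value that was at 'ptr' before the modification, e.g.
//
//     int32_t counter = 5;                                                    // hypothetical
//     int32_t old  = __TBB_FetchAndAddGeneric<4,int32_t>( &counter, 1 );      // old==5,  counter==6
//     int32_t prev = __TBB_FetchAndStoreGeneric<4,int32_t>( &counter, 0 );    // prev==6, counter==0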
529 | |
530 | #if __TBB_USE_GENERIC_PART_WORD_CAS |
531 | #define __TBB_machine_cmpswp1 tbb::internal::__TBB_CompareAndSwapGeneric<1,int8_t> |
532 | #define __TBB_machine_cmpswp2 tbb::internal::__TBB_CompareAndSwapGeneric<2,int16_t> |
533 | #endif |
534 | |
535 | #if __TBB_USE_GENERIC_FETCH_ADD || __TBB_USE_GENERIC_PART_WORD_FETCH_ADD |
536 | #define __TBB_machine_fetchadd1 tbb::internal::__TBB_FetchAndAddGeneric<1,int8_t> |
537 | #define __TBB_machine_fetchadd2 tbb::internal::__TBB_FetchAndAddGeneric<2,int16_t> |
538 | #endif |
539 | |
540 | #if __TBB_USE_GENERIC_FETCH_ADD |
541 | #define __TBB_machine_fetchadd4 tbb::internal::__TBB_FetchAndAddGeneric<4,int32_t> |
542 | #endif |
543 | |
544 | #if __TBB_USE_GENERIC_FETCH_ADD || __TBB_USE_GENERIC_DWORD_FETCH_ADD |
545 | #define __TBB_machine_fetchadd8 tbb::internal::__TBB_FetchAndAddGeneric<8,int64_t> |
546 | #endif |
547 | |
548 | #if __TBB_USE_GENERIC_FETCH_STORE || __TBB_USE_GENERIC_PART_WORD_FETCH_STORE |
549 | #define __TBB_machine_fetchstore1 tbb::internal::__TBB_FetchAndStoreGeneric<1,int8_t> |
550 | #define __TBB_machine_fetchstore2 tbb::internal::__TBB_FetchAndStoreGeneric<2,int16_t> |
551 | #endif |
552 | |
553 | #if __TBB_USE_GENERIC_FETCH_STORE |
554 | #define __TBB_machine_fetchstore4 tbb::internal::__TBB_FetchAndStoreGeneric<4,int32_t> |
555 | #endif |
556 | |
557 | #if __TBB_USE_GENERIC_FETCH_STORE || __TBB_USE_GENERIC_DWORD_FETCH_STORE |
558 | #define __TBB_machine_fetchstore8 tbb::internal::__TBB_FetchAndStoreGeneric<8,int64_t> |
559 | #endif |
560 | |
561 | #if __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE |
562 | #define __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(S) \ |
563 | atomic_selector<S>::word atomic_selector<S>::fetch_store ( volatile void* location, word value ) { \ |
564 | return __TBB_machine_fetchstore##S( location, value ); \ |
565 | } |
566 | |
567 | __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(1) |
568 | __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(2) |
569 | __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(4) |
570 | __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(8) |
571 | |
572 | #undef __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE |
573 | #endif /* __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */ |
574 | |
575 | #if __TBB_USE_GENERIC_DWORD_LOAD_STORE |
576 | /*TODO: find a more elegant way to handle function names difference*/ |
577 | #if ! __TBB_USE_FENCED_ATOMICS |
578 | /* This name forwarding is needed for generic implementation of |
579 | * load8/store8 defined below (via macro) to pick the right CAS function*/ |
580 | #define __TBB_machine_cmpswp8full_fence __TBB_machine_cmpswp8 |
581 | #endif |
582 | __TBB_MACHINE_DEFINE_LOAD8_GENERIC_FENCED(full_fence) |
583 | __TBB_MACHINE_DEFINE_STORE8_GENERIC_FENCED(full_fence) |
584 | |
585 | #if ! __TBB_USE_FENCED_ATOMICS |
586 | #undef __TBB_machine_cmpswp8full_fence |
587 | #endif |
588 | |
589 | #define __TBB_machine_store8 tbb::internal::__TBB_machine_generic_store8full_fence |
590 | #define __TBB_machine_load8 tbb::internal::__TBB_machine_generic_load8full_fence |
591 | #endif /* __TBB_USE_GENERIC_DWORD_LOAD_STORE */ |
592 | |
593 | #if __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE |
/** Fenced operations use the volatile qualifier to prevent the compiler from optimizing
them out, and on architectures with weak memory ordering to induce the compiler
to generate code with appropriate acquire/release semantics.
On architectures like IA32 and Intel64 (and likely Sparc TSO), volatile has
no effect on code generation, and the consistency helpers serve as a compiler fence (the
latter being true for IA64/gcc as well, to fix a bug in some gcc versions).
600 | This code assumes that the generated instructions will operate atomically, |
601 | which typically requires a type that can be moved in a single instruction, |
602 | cooperation from the compiler for effective use of such an instruction, |
603 | and appropriate alignment of the data. **/ |
604 | template <typename T, size_t S> |
605 | struct machine_load_store { |
606 | static T load_with_acquire ( const volatile T& location ) { |
607 | T to_return = location; |
608 | __TBB_acquire_consistency_helper(); |
609 | return to_return; |
610 | } |
611 | static void store_with_release ( volatile T &location, T value ) { |
612 | __TBB_release_consistency_helper(); |
613 | location = value; |
614 | } |
615 | }; |
616 | |
// In general, a plain load or store generated by a 32-bit compiler is not atomic for 64-bit types.
618 | #if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS |
619 | template <typename T> |
620 | struct machine_load_store<T,8> { |
621 | static T load_with_acquire ( const volatile T& location ) { |
622 | return (T)__TBB_machine_load8( (const volatile void*)&location ); |
623 | } |
624 | static void store_with_release ( volatile T& location, T value ) { |
625 | __TBB_machine_store8( (volatile void*)&location, (int64_t)value ); |
626 | } |
627 | }; |
628 | #endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */ |
629 | #endif /* __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE */ |
630 | |
631 | #if __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE |
632 | template <typename T, size_t S> |
633 | struct machine_load_store_seq_cst { |
634 | static T load ( const volatile T& location ) { |
635 | __TBB_full_memory_fence(); |
636 | return machine_load_store<T,S>::load_with_acquire( location ); |
637 | } |
638 | #if __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE |
639 | static void store ( volatile T &location, T value ) { |
640 | atomic_selector<S>::fetch_store( (volatile void*)&location, (typename atomic_selector<S>::word)value ); |
641 | } |
642 | #else /* !__TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */ |
643 | static void store ( volatile T &location, T value ) { |
644 | machine_load_store<T,S>::store_with_release( location, value ); |
645 | __TBB_full_memory_fence(); |
646 | } |
647 | #endif /* !__TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */ |
648 | }; |
649 | |
650 | #if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS |
651 | /** The implementation does not use functions __TBB_machine_load8/store8 as they |
652 | are not required to be sequentially consistent. **/ |
653 | template <typename T> |
654 | struct machine_load_store_seq_cst<T,8> { |
655 | static T load ( const volatile T& location ) { |
// Comparand and new value may be anything; they only have to be equal, and
// the value should have a low probability of actually being found in 'location'.
658 | const int64_t anyvalue = 2305843009213693951LL; |
659 | return __TBB_machine_cmpswp8( (volatile void*)const_cast<volatile T*>(&location), anyvalue, anyvalue ); |
660 | } |
661 | static void store ( volatile T &location, T value ) { |
662 | #if __TBB_GCC_VERSION >= 40702 |
663 | #pragma GCC diagnostic push |
664 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" |
665 | #endif |
// When this store performs the atomic's initialization, the read below may access
// uninitialized memory, hence the warning suppression above.
667 | int64_t result = (volatile int64_t&)location; |
668 | #if __TBB_GCC_VERSION >= 40702 |
669 | #pragma GCC diagnostic pop |
670 | #endif |
671 | while ( __TBB_machine_cmpswp8((volatile void*)&location, (int64_t)value, result) != result ) |
672 | result = (volatile int64_t&)location; |
673 | } |
674 | }; |
675 | #endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */ |
676 | #endif /*__TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE */ |
677 | |
678 | #if __TBB_USE_GENERIC_RELAXED_LOAD_STORE |
// Relaxed operations add the volatile qualifier to prevent the compiler from optimizing them out.
/** Volatile should not incur any additional cost on IA32, Intel64, and Sparc TSO
architectures. However, on architectures with weak memory ordering the compiler may
generate code with acquire/release semantics for operations on volatile data. **/
683 | template <typename T, size_t S> |
684 | struct machine_load_store_relaxed { |
685 | static inline T load ( const volatile T& location ) { |
686 | return location; |
687 | } |
688 | static inline void store ( volatile T& location, T value ) { |
689 | location = value; |
690 | } |
691 | }; |
692 | |
693 | #if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS |
694 | template <typename T> |
695 | struct machine_load_store_relaxed<T,8> { |
696 | static inline T load ( const volatile T& location ) { |
697 | return (T)__TBB_machine_load8( (const volatile void*)&location ); |
698 | } |
699 | static inline void store ( volatile T& location, T value ) { |
700 | __TBB_machine_store8( (volatile void*)&location, (int64_t)value ); |
701 | } |
702 | }; |
703 | #endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */ |
704 | #endif /* __TBB_USE_GENERIC_RELAXED_LOAD_STORE */ |
705 | |
706 | #undef __TBB_WORDSIZE //this macro is forbidden to use outside of atomic machinery |
707 | |
708 | template<typename T> |
709 | inline T __TBB_load_with_acquire(const volatile T &location) { |
710 | return machine_load_store<T,sizeof(T)>::load_with_acquire( location ); |
711 | } |
712 | template<typename T, typename V> |
713 | inline void __TBB_store_with_release(volatile T& location, V value) { |
714 | machine_load_store<T,sizeof(T)>::store_with_release( location, T(value) ); |
715 | } |
716 | //! Overload that exists solely to avoid /Wp64 warnings. |
717 | inline void __TBB_store_with_release(volatile size_t& location, size_t value) { |
718 | machine_load_store<size_t,sizeof(size_t)>::store_with_release( location, value ); |
719 | } |
720 | |
721 | template<typename T> |
722 | inline T __TBB_load_full_fence(const volatile T &location) { |
723 | return machine_load_store_seq_cst<T,sizeof(T)>::load( location ); |
724 | } |
725 | template<typename T, typename V> |
726 | inline void __TBB_store_full_fence(volatile T& location, V value) { |
727 | machine_load_store_seq_cst<T,sizeof(T)>::store( location, T(value) ); |
728 | } |
729 | //! Overload that exists solely to avoid /Wp64 warnings. |
730 | inline void __TBB_store_full_fence(volatile size_t& location, size_t value) { |
731 | machine_load_store_seq_cst<size_t,sizeof(size_t)>::store( location, value ); |
732 | } |
733 | |
734 | template<typename T> |
735 | inline T __TBB_load_relaxed (const volatile T& location) { |
736 | return machine_load_store_relaxed<T,sizeof(T)>::load( const_cast<T&>(location) ); |
737 | } |
738 | template<typename T, typename V> |
739 | inline void __TBB_store_relaxed ( volatile T& location, V value ) { |
740 | machine_load_store_relaxed<T,sizeof(T)>::store( const_cast<T&>(location), T(value) ); |
741 | } |
742 | //! Overload that exists solely to avoid /Wp64 warnings. |
743 | inline void __TBB_store_relaxed ( volatile size_t& location, size_t value ) { |
744 | machine_load_store_relaxed<size_t,sizeof(size_t)>::store( const_cast<size_t&>(location), value ); |
745 | } |
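// Illustrative usage (hypothetical variables, not part of TBB): a release store
// publishes 'payload' before setting 'ready', and an acquire load of 'ready'
// guarantees that a consumer observing ready==1 also observes the payload.
//
//     payload = 42;                               // producer thread
//     __TBB_store_with_release( ready, 1 );
//
//     while( !__TBB_load_with_acquire(ready) )    // consumer thread
//         __TBB_Yield();
//     assert( payload == 42 );                    // guaranteed to hold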
746 | |
// The macro __TBB_TypeWithAlignmentAtLeastAsStrict(T) should expand to a type with alignment at least as
// strict as that of type T. The type should have a trivial default constructor and destructor, so that
// arrays of that type can be declared without initializers.
// It is correct (but perhaps a waste of space) if __TBB_TypeWithAlignmentAtLeastAsStrict(T) expands
// to a type bigger than T.
// The default definition here works on machines where integers are naturally aligned and the
// strictest alignment requirement is 64 bytes.
754 | #ifndef __TBB_TypeWithAlignmentAtLeastAsStrict |
755 | |
756 | #if __TBB_ALIGNAS_PRESENT |
757 | |
758 | // Use C++11 keywords alignas and alignof |
759 | #define __TBB_DefineTypeWithAlignment(PowerOf2) \ |
760 | struct alignas(PowerOf2) __TBB_machine_type_with_alignment_##PowerOf2 { \ |
761 | uint32_t member[PowerOf2/sizeof(uint32_t)]; \ |
762 | }; |
763 | #define __TBB_alignof(T) alignof(T) |
764 | |
765 | #elif __TBB_ATTRIBUTE_ALIGNED_PRESENT |
766 | |
767 | #define __TBB_DefineTypeWithAlignment(PowerOf2) \ |
768 | struct __TBB_machine_type_with_alignment_##PowerOf2 { \ |
769 | uint32_t member[PowerOf2/sizeof(uint32_t)]; \ |
770 | } __attribute__((aligned(PowerOf2))); |
771 | #define __TBB_alignof(T) __alignof__(T) |
772 | |
773 | #elif __TBB_DECLSPEC_ALIGN_PRESENT |
774 | |
775 | #define __TBB_DefineTypeWithAlignment(PowerOf2) \ |
776 | __declspec(align(PowerOf2)) \ |
777 | struct __TBB_machine_type_with_alignment_##PowerOf2 { \ |
778 | uint32_t member[PowerOf2/sizeof(uint32_t)]; \ |
779 | }; |
780 | #define __TBB_alignof(T) __alignof(T) |
781 | |
782 | #else /* A compiler with unknown syntax for data alignment */ |
783 | #error Must define __TBB_TypeWithAlignmentAtLeastAsStrict(T) |
784 | #endif |
785 | |
786 | /* Now declare types aligned to useful powers of two */ |
787 | __TBB_DefineTypeWithAlignment(8) // i386 ABI says that uint64_t is aligned on 4 bytes |
788 | __TBB_DefineTypeWithAlignment(16) |
789 | __TBB_DefineTypeWithAlignment(32) |
790 | __TBB_DefineTypeWithAlignment(64) |
791 | |
792 | typedef __TBB_machine_type_with_alignment_64 __TBB_machine_type_with_strictest_alignment; |
793 | |
794 | // Primary template is a declaration of incomplete type so that it fails with unknown alignments |
795 | template<size_t N> struct type_with_alignment; |
796 | |
797 | // Specializations for allowed alignments |
798 | template<> struct type_with_alignment<1> { char member; }; |
799 | template<> struct type_with_alignment<2> { uint16_t member; }; |
800 | template<> struct type_with_alignment<4> { uint32_t member; }; |
801 | template<> struct type_with_alignment<8> { __TBB_machine_type_with_alignment_8 member; }; |
802 | template<> struct type_with_alignment<16> {__TBB_machine_type_with_alignment_16 member; }; |
803 | template<> struct type_with_alignment<32> {__TBB_machine_type_with_alignment_32 member; }; |
804 | template<> struct type_with_alignment<64> {__TBB_machine_type_with_alignment_64 member; }; |
805 | |
806 | #if __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN |
//! Work-around for a bug in the GNU 3.2 and MSVC compilers.
/** The bug is that the compiler sometimes returns 0 for __alignof(T) when T has not yet been instantiated.
The work-around forces instantiation by forcing computation of sizeof(T) before __alignof(T). */
810 | template<size_t Size, typename T> |
811 | struct work_around_alignment_bug { |
812 | static const size_t alignment = __TBB_alignof(T); |
813 | }; |
814 | #define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<tbb::internal::work_around_alignment_bug<sizeof(T),T>::alignment> |
815 | #else |
816 | #define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<__TBB_alignof(T)> |
817 | #endif /* __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN */ |
818 | |
819 | #endif /* __TBB_TypeWithAlignmentAtLeastAsStrict */ |
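// Illustrative usage (a sketch with a hypothetical type T): raw storage whose alignment
// is at least as strict as that of T, suitable for constructing T with placement new.
//
//     typedef __TBB_TypeWithAlignmentAtLeastAsStrict(T) element_type;
//     element_type storage[(sizeof(T)+sizeof(element_type)-1)/sizeof(element_type)];
//     T* obj = new( (void*)storage ) T();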
820 | |
821 | // Template class here is to avoid instantiation of the static data for modules that don't use it |
822 | template<typename T> |
823 | struct reverse { |
824 | static const T byte_table[256]; |
825 | }; |
826 | // An efficient implementation of the reverse function utilizes a 2^8 lookup table holding the bit-reversed |
827 | // values of [0..2^8 - 1]. Those values can also be computed on the fly at a slightly higher cost. |
828 | template<typename T> |
829 | const T reverse<T>::byte_table[256] = { |
830 | 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, |
831 | 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, |
832 | 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, |
833 | 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, |
834 | 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, |
835 | 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, |
836 | 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, |
837 | 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, |
838 | 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, |
839 | 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, |
840 | 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, |
841 | 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, |
842 | 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, |
843 | 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, |
844 | 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, |
845 | 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF |
846 | }; |
847 | |
848 | } // namespace internal @endcond |
849 | } // namespace tbb |
850 | |
851 | // Preserving access to legacy APIs |
852 | using tbb::internal::__TBB_load_with_acquire; |
853 | using tbb::internal::__TBB_store_with_release; |
854 | |
855 | // Mapping historically used names to the ones expected by atomic_load_store_traits |
856 | #define __TBB_load_acquire __TBB_load_with_acquire |
857 | #define __TBB_store_release __TBB_store_with_release |
858 | |
859 | #ifndef __TBB_Log2 |
860 | inline intptr_t __TBB_Log2( uintptr_t x ) { |
861 | if( x==0 ) return -1; |
862 | intptr_t result = 0; |
863 | |
864 | #if !defined(_M_ARM) |
865 | uintptr_t tmp_; |
866 | if( sizeof(x)>4 && (tmp_ = ((uint64_t)x)>>32) ) { x=tmp_; result += 32; } |
867 | #endif |
868 | if( uintptr_t tmp = x>>16 ) { x=tmp; result += 16; } |
869 | if( uintptr_t tmp = x>>8 ) { x=tmp; result += 8; } |
870 | if( uintptr_t tmp = x>>4 ) { x=tmp; result += 4; } |
871 | if( uintptr_t tmp = x>>2 ) { x=tmp; result += 2; } |
872 | |
873 | return (x&2)? result+1: result; |
874 | } |
875 | #endif |
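// For reference, the generic __TBB_Log2 above returns the index of the highest set bit:
// __TBB_Log2(1)==0, __TBB_Log2(3)==1, __TBB_Log2(64)==6, and __TBB_Log2(0)==-1 by convention.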
876 | |
877 | #ifndef __TBB_AtomicOR |
878 | inline void __TBB_AtomicOR( volatile void *operand, uintptr_t addend ) { |
879 | for( tbb::internal::atomic_backoff b;;b.pause() ) { |
880 | uintptr_t tmp = *(volatile uintptr_t *)operand; |
881 | uintptr_t result = __TBB_CompareAndSwapW(operand, tmp|addend, tmp); |
882 | if( result==tmp ) break; |
883 | } |
884 | } |
885 | #endif |
886 | |
887 | #ifndef __TBB_AtomicAND |
888 | inline void __TBB_AtomicAND( volatile void *operand, uintptr_t addend ) { |
889 | for( tbb::internal::atomic_backoff b;;b.pause() ) { |
890 | uintptr_t tmp = *(volatile uintptr_t *)operand; |
891 | uintptr_t result = __TBB_CompareAndSwapW(operand, tmp&addend, tmp); |
892 | if( result==tmp ) break; |
893 | } |
894 | } |
895 | #endif |
896 | |
897 | #if __TBB_PREFETCHING |
898 | #ifndef __TBB_cl_prefetch |
899 | #error This platform does not define cache management primitives required for __TBB_PREFETCHING |
900 | #endif |
901 | |
902 | #ifndef __TBB_cl_evict |
903 | #define __TBB_cl_evict(p) |
904 | #endif |
905 | #endif |
906 | |
907 | #ifndef __TBB_Flag |
908 | typedef unsigned char __TBB_Flag; |
909 | #endif |
910 | typedef __TBB_atomic __TBB_Flag __TBB_atomic_flag; |
911 | |
912 | #ifndef __TBB_TryLockByte |
913 | inline bool __TBB_TryLockByte( __TBB_atomic_flag &flag ) { |
914 | return __TBB_machine_cmpswp1(&flag,1,0)==0; |
915 | } |
916 | #endif |
917 | |
918 | #ifndef __TBB_LockByte |
919 | inline __TBB_Flag __TBB_LockByte( __TBB_atomic_flag& flag ) { |
920 | tbb::internal::atomic_backoff backoff; |
921 | while( !__TBB_TryLockByte(flag) ) backoff.pause(); |
922 | return 0; |
923 | } |
924 | #endif |
925 | |
926 | #ifndef __TBB_UnlockByte |
927 | #define __TBB_UnlockByte(addr) __TBB_store_with_release((addr),0) |
928 | #endif |
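// Illustrative usage (a sketch, not TBB's mutex implementation): the byte-lock
// primitives above are sufficient for a minimal test-and-set spin lock.
//
//     static __TBB_atomic_flag my_lock = 0;    // hypothetical flag, zero-initialized
//     __TBB_LockByte( my_lock );               // spins (with backoff) until acquired
//     /* ... critical section ... */
//     __TBB_UnlockByte( my_lock );             // release store of 0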
929 | |
930 | // lock primitives with Intel(R) Transactional Synchronization Extensions (Intel(R) TSX) |
931 | #if ( __TBB_x86_32 || __TBB_x86_64 ) /* only on ia32/intel64 */ |
932 | inline void __TBB_TryLockByteElidedCancel() { __TBB_machine_try_lock_elided_cancel(); } |
933 | |
934 | inline bool __TBB_TryLockByteElided( __TBB_atomic_flag& flag ) { |
935 | bool res = __TBB_machine_try_lock_elided( &flag )!=0; |
936 | // to avoid the "lemming" effect, we need to abort the transaction |
937 | // if __TBB_machine_try_lock_elided returns false (i.e., someone else |
938 | // has acquired the mutex non-speculatively). |
939 | if( !res ) __TBB_TryLockByteElidedCancel(); |
940 | return res; |
941 | } |
942 | |
943 | inline void __TBB_LockByteElided( __TBB_atomic_flag& flag ) |
944 | { |
945 | for(;;) { |
946 | tbb::internal::spin_wait_while_eq( flag, 1 ); |
947 | if( __TBB_machine_try_lock_elided( &flag ) ) |
948 | return; |
949 | // Another thread acquired the lock "for real". |
950 | // To avoid the "lemming" effect, we abort the transaction. |
951 | __TBB_TryLockByteElidedCancel(); |
952 | } |
953 | } |
954 | |
955 | inline void __TBB_UnlockByteElided( __TBB_atomic_flag& flag ) { |
956 | __TBB_machine_unlock_elided( &flag ); |
957 | } |
958 | #endif |
959 | |
960 | #ifndef __TBB_ReverseByte |
961 | inline unsigned char __TBB_ReverseByte(unsigned char src) { |
962 | return tbb::internal::reverse<unsigned char>::byte_table[src]; |
963 | } |
964 | #endif |
965 | |
966 | template<typename T> |
967 | T __TBB_ReverseBits(T src) { |
968 | T dst; |
969 | unsigned char *original = (unsigned char *) &src; |
970 | unsigned char *reversed = (unsigned char *) &dst; |
971 | |
972 | for( int i = sizeof(T)-1; i >= 0; i-- ) |
973 | reversed[i] = __TBB_ReverseByte( original[sizeof(T)-i-1] ); |
974 | |
975 | return dst; |
976 | } |
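// For reference (values follow from the byte table above): __TBB_ReverseByte(0x01)==0x80,
// and __TBB_ReverseBits reverses the full bit order of its argument, e.g. a uint16_t
// value 0x0001 becomes 0x8000.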
977 | |
978 | #endif /* __TBB_machine_H */ |
979 | |