1 | /* |
2 | Copyright 2005-2013 Intel Corporation. All Rights Reserved. |
3 | |
4 | This file is part of Threading Building Blocks. |
5 | |
6 | Threading Building Blocks is free software; you can redistribute it |
7 | and/or modify it under the terms of the GNU General Public License |
8 | version 2 as published by the Free Software Foundation. |
9 | |
10 | Threading Building Blocks is distributed in the hope that it will be |
11 | useful, but WITHOUT ANY WARRANTY; without even the implied warranty |
12 | of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | GNU General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU General Public License |
16 | along with Threading Building Blocks; if not, write to the Free Software |
17 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
18 | |
19 | As a special exception, you may use this file as part of a free software |
20 | library without restriction. Specifically, if other files instantiate |
21 | templates or use macros or inline functions from this file, or you compile |
22 | this file and link it with other files to produce an executable, this |
23 | file does not by itself cause the resulting executable to be covered by |
24 | the GNU General Public License. This exception does not however |
25 | invalidate any other reasons why the executable file might be covered by |
26 | the GNU General Public License. |
27 | */ |
28 | |
29 | #ifndef __TBB_machine_H |
30 | #define __TBB_machine_H |
31 | |
/** This header provides a basic platform abstraction layer by hooking up appropriate
    architecture/OS/compiler specific headers from the /include/tbb/machine directory.
    If a plug-in header does not implement all the required APIs, it must request
    generic implementations of the missing ones by setting one or more of the following macros:
36 | |
37 | __TBB_USE_GENERIC_PART_WORD_CAS |
38 | __TBB_USE_GENERIC_PART_WORD_FETCH_ADD |
39 | __TBB_USE_GENERIC_PART_WORD_FETCH_STORE |
40 | __TBB_USE_GENERIC_FETCH_ADD |
41 | __TBB_USE_GENERIC_FETCH_STORE |
42 | __TBB_USE_GENERIC_DWORD_FETCH_ADD |
43 | __TBB_USE_GENERIC_DWORD_FETCH_STORE |
44 | __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE |
    __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE
    __TBB_USE_GENERIC_DWORD_LOAD_STORE
    __TBB_USE_GENERIC_RELAXED_LOAD_STORE
47 | __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE |
48 | |
    In this case tbb_machine.h will add the missing functionality based on a minimal set
    of APIs that all plug-in headers are required to implement, as described below.
    Note that these generic implementations may be sub-optimal for a particular
    architecture, and thus should be relied upon only after careful evaluation
    or as a last resort.
55 | |
    Additionally __TBB_64BIT_ATOMICS can be set to 0 on a 32-bit architecture to
    indicate that the port does not support double-word atomics. It may also
58 | be set to 1 explicitly, though normally this is not necessary as tbb_machine.h |
59 | will set it automatically. |
60 | |
    The __TBB_ENDIANNESS macro can be defined by the implementation as well.
    It is used only if __TBB_USE_GENERIC_PART_WORD_CAS is set (or for testing),
    and must specify the layout of aligned 16-bit and 32-bit data anywhere within a process
    (while the details of unaligned 16-bit or 32-bit data or of 64-bit data are irrelevant).
    The layout must be the same at all relevant memory locations within the current process;
    in case of page-specific endianness, one endianness must be kept "out of sight".
    Possible settings, reflecting hardware and possibly OS convention, are:
    -  __TBB_ENDIAN_BIG for big-endian data,
    -  __TBB_ENDIAN_LITTLE for little-endian data,
    -  __TBB_ENDIAN_DETECT for run-time detection, when exactly one of the above applies
       but which one is not known at compile time,
    -  __TBB_ENDIAN_UNSUPPORTED to prevent undefined behavior when neither of the above applies.
72 | |
73 | Prerequisites for each architecture port |
74 | ---------------------------------------- |
    The following functions and macros have no generic implementation. Therefore they must be
    implemented in each architecture-specific machine header, either as a conventional
    function or as a function-like macro.
78 | |
79 | __TBB_WORDSIZE |
        The size of a machine word in bytes, i.e. it should be defined to 4 on
        32-bit systems and to 8 on 64-bit systems.
82 | |
83 | __TBB_Yield() |
        Signals the OS that the current thread is willing to relinquish the
        remainder of its time quantum.
86 | |
87 | __TBB_full_memory_fence() |
88 | Must prevent all memory operations from being reordered across it (both |
89 | by hardware and compiler). All such fences must be totally ordered (or |
90 | sequentially consistent). |
91 | |
92 | __TBB_machine_cmpswp4( volatile void *ptr, int32_t value, int32_t comparand ) |
93 | Must be provided if __TBB_USE_FENCED_ATOMICS is not set. |
94 | |
    __TBB_machine_cmpswp8( volatile void *ptr, int64_t value, int64_t comparand )
        Must be provided for 64-bit architectures if __TBB_USE_FENCED_ATOMICS is not set,
        and for 32-bit architectures if __TBB_64BIT_ATOMICS is set.
98 | |
99 | __TBB_machine_<op><S><fence>(...), where |
100 | <op> = {cmpswp, fetchadd, fetchstore} |
101 | <S> = {1, 2, 4, 8} |
102 | <fence> = {full_fence, acquire, release, relaxed} |
103 | Must be provided if __TBB_USE_FENCED_ATOMICS is set. |
104 | |
105 | __TBB_control_consistency_helper() |
106 | Bridges the memory-semantics gap between architectures providing only |
107 | implicit C++0x "consume" semantics (like Power Architecture) and those |
108 | also implicitly obeying control dependencies (like IA-64 architecture). |
109 | It must be used only in conditional code where the condition is itself |
110 | data-dependent, and will then make subsequent code behave as if the |
111 | original data dependency were acquired. |
        It needs only to be a compiler fence where the architecture implies the
        required ordering, either specifically (like IA-64 architecture) or because
        generally stronger "acquire" semantics are enforced (like x86).
115 | It is always valid, though potentially suboptimal, to replace |
116 | control with acquire on the load and then remove the helper. |
117 | |
118 | __TBB_acquire_consistency_helper(), __TBB_release_consistency_helper() |
119 | Must be provided if __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE is set. |
120 | Enforce acquire and release semantics in generic implementations of fenced |
121 | store and load operations. Depending on the particular architecture/compiler |
122 | combination they may be a hardware fence, a compiler fence, both or nothing. |
123 | **/ |
124 | |
125 | #include "tbb_stddef.h" |
126 | |
127 | namespace tbb { |
128 | namespace internal { |
129 | |
130 | //////////////////////////////////////////////////////////////////////////////// |
131 | // Overridable helpers declarations |
132 | // |
133 | // A machine/*.h file may choose to define these templates, otherwise it must |
134 | // request default implementation by setting appropriate __TBB_USE_GENERIC_XXX macro(s). |
135 | // |
136 | template <typename T, std::size_t S> |
137 | struct machine_load_store; |
138 | |
139 | template <typename T, std::size_t S> |
140 | struct machine_load_store_relaxed; |
141 | |
142 | template <typename T, std::size_t S> |
143 | struct machine_load_store_seq_cst; |
144 | // |
145 | // End of overridable helpers declarations |
146 | //////////////////////////////////////////////////////////////////////////////// |
147 | |
148 | template<size_t S> struct atomic_selector; |
149 | |
150 | template<> struct atomic_selector<1> { |
151 | typedef int8_t word; |
152 | inline static word fetch_store ( volatile void* location, word value ); |
153 | }; |
154 | |
155 | template<> struct atomic_selector<2> { |
156 | typedef int16_t word; |
157 | inline static word fetch_store ( volatile void* location, word value ); |
158 | }; |
159 | |
160 | template<> struct atomic_selector<4> { |
161 | #if _MSC_VER && !_WIN64 |
162 | // Work-around that avoids spurious /Wp64 warnings |
163 | typedef intptr_t word; |
164 | #else |
165 | typedef int32_t word; |
166 | #endif |
167 | inline static word fetch_store ( volatile void* location, word value ); |
168 | }; |
169 | |
170 | template<> struct atomic_selector<8> { |
171 | typedef int64_t word; |
172 | inline static word fetch_store ( volatile void* location, word value ); |
173 | }; |
174 | |
175 | }} // namespaces internal, tbb |
176 | |
177 | #define __TBB_MACHINE_DEFINE_STORE8_GENERIC_FENCED(M) \ |
178 | inline void __TBB_machine_generic_store8##M(volatile void *ptr, int64_t value) { \ |
179 | for(;;) { \ |
180 | int64_t result = *(int64_t *)ptr; \ |
181 | if( __TBB_machine_cmpswp8##M(ptr,value,result)==result ) break; \ |
182 | } \ |
183 | } \ |
184 | |
185 | #define __TBB_MACHINE_DEFINE_LOAD8_GENERIC_FENCED(M) \ |
186 | inline int64_t __TBB_machine_generic_load8##M(const volatile void *ptr) { \ |
187 | /* Comparand and new value may be anything, they only must be equal, and */ \ |
188 | /* the value should have a low probability to be actually found in 'location'.*/ \ |
189 | const int64_t anyvalue = 2305843009213693951LL; \ |
190 | return __TBB_machine_cmpswp8##M(const_cast<volatile void *>(ptr),anyvalue,anyvalue); \ |
191 | } \ |
192 | |
193 | // The set of allowed values for __TBB_ENDIANNESS (see above for details) |
194 | #define __TBB_ENDIAN_UNSUPPORTED -1 |
195 | #define __TBB_ENDIAN_LITTLE 0 |
196 | #define __TBB_ENDIAN_BIG 1 |
197 | #define __TBB_ENDIAN_DETECT 2 |
198 | |
199 | #if _WIN32||_WIN64 |
200 | |
201 | #ifdef _MANAGED |
202 | #pragma managed(push, off) |
203 | #endif |
204 | |
205 | #if __MINGW64__ || __MINGW32__ |
206 | extern "C" __declspec(dllimport) int __stdcall SwitchToThread( void ); |
207 | #define __TBB_Yield() SwitchToThread() |
208 | #if (TBB_USE_GCC_BUILTINS && __TBB_GCC_BUILTIN_ATOMICS_PRESENT) |
209 | #include "machine/gcc_generic.h" |
210 | #elif __MINGW64__ |
211 | #include "machine/linux_intel64.h" |
212 | #elif __MINGW32__ |
213 | #include "machine/linux_ia32.h" |
214 | #endif |
215 | #elif (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT) |
216 | #include "machine/icc_generic.h" |
217 | #elif defined(_M_IX86) && !defined(__TBB_WIN32_USE_CL_BUILTINS) |
218 | #include "machine/windows_ia32.h" |
219 | #elif defined(_M_X64) |
220 | #include "machine/windows_intel64.h" |
221 | #elif defined(_XBOX) |
222 | #include "machine/xbox360_ppc.h" |
223 | #elif defined(_M_ARM) || defined(__TBB_WIN32_USE_CL_BUILTINS) |
224 | #include "machine/msvc_armv7.h" |
225 | #endif |
226 | |
227 | #ifdef _MANAGED |
228 | #pragma managed(pop) |
229 | #endif |
230 | |
231 | #elif __TBB_DEFINE_MIC |
232 | |
233 | #include "machine/mic_common.h" |
234 | //TODO: check if ICC atomic intrinsics are available for MIC |
235 | #include "machine/linux_intel64.h" |
236 | |
237 | #elif __linux__ || __FreeBSD__ || __NetBSD__ |
238 | |
239 | #if (TBB_USE_GCC_BUILTINS && __TBB_GCC_BUILTIN_ATOMICS_PRESENT) |
240 | #include "machine/gcc_generic.h" |
241 | #elif (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT) |
242 | #include "machine/icc_generic.h" |
243 | #elif __i386__ |
244 | #include "machine/linux_ia32.h" |
245 | #elif __x86_64__ |
246 | #include "machine/linux_intel64.h" |
247 | #elif __ia64__ |
248 | #include "machine/linux_ia64.h" |
249 | #elif __powerpc__ |
250 | #include "machine/mac_ppc.h" |
251 | #elif __arm__ |
252 | #include "machine/gcc_armv7.h" |
253 | #elif __TBB_GCC_BUILTIN_ATOMICS_PRESENT |
254 | #include "machine/gcc_generic.h" |
255 | #endif |
256 | #include "machine/linux_common.h" |
257 | |
258 | #elif __APPLE__ |
259 | //TODO: TBB_USE_GCC_BUILTINS is not used for Mac, Sun, Aix |
260 | #if (TBB_USE_ICC_BUILTINS && __TBB_ICC_BUILTIN_ATOMICS_PRESENT) |
261 | #include "machine/icc_generic.h" |
262 | #elif __i386__ |
263 | #include "machine/linux_ia32.h" |
264 | #elif __x86_64__ |
265 | #include "machine/linux_intel64.h" |
266 | #elif __POWERPC__ |
267 | #include "machine/mac_ppc.h" |
268 | #endif |
269 | #include "machine/macos_common.h" |
270 | |
271 | #elif _AIX |
272 | |
273 | #include "machine/ibm_aix51.h" |
274 | |
275 | #elif __sun || __SUNPRO_CC |
276 | |
277 | #define __asm__ asm |
278 | #define __volatile__ volatile |
279 | |
280 | #if __i386 || __i386__ |
281 | #include "machine/linux_ia32.h" |
282 | #elif __x86_64__ |
283 | #include "machine/linux_intel64.h" |
284 | #elif __sparc |
285 | #include "machine/sunos_sparc.h" |
286 | #endif |
287 | #include <sched.h> |
288 | |
289 | #define __TBB_Yield() sched_yield() |
290 | |
291 | #endif /* OS selection */ |
292 | |
293 | #ifndef __TBB_64BIT_ATOMICS |
294 | #define __TBB_64BIT_ATOMICS 1 |
295 | #endif |
296 | |
297 | //TODO: replace usage of these functions with usage of tbb::atomic, and then remove them |
//TODO: map functions with W suffix to use a cast to tbb::atomic and the corresponding op, i.e. as_atomic().op()
299 | // Special atomic functions |
300 | #if __TBB_USE_FENCED_ATOMICS |
301 | #define __TBB_machine_cmpswp1 __TBB_machine_cmpswp1full_fence |
302 | #define __TBB_machine_cmpswp2 __TBB_machine_cmpswp2full_fence |
303 | #define __TBB_machine_cmpswp4 __TBB_machine_cmpswp4full_fence |
304 | #define __TBB_machine_cmpswp8 __TBB_machine_cmpswp8full_fence |
305 | |
306 | #if __TBB_WORDSIZE==8 |
307 | #define __TBB_machine_fetchadd8 __TBB_machine_fetchadd8full_fence |
308 | #define __TBB_machine_fetchstore8 __TBB_machine_fetchstore8full_fence |
309 | #define __TBB_FetchAndAddWrelease(P,V) __TBB_machine_fetchadd8release(P,V) |
310 | #define __TBB_FetchAndIncrementWacquire(P) __TBB_machine_fetchadd8acquire(P,1) |
311 | #define __TBB_FetchAndDecrementWrelease(P) __TBB_machine_fetchadd8release(P,(-1)) |
312 | #else |
313 | #define __TBB_machine_fetchadd4 __TBB_machine_fetchadd4full_fence |
314 | #define __TBB_machine_fetchstore4 __TBB_machine_fetchstore4full_fence |
315 | #define __TBB_FetchAndAddWrelease(P,V) __TBB_machine_fetchadd4release(P,V) |
316 | #define __TBB_FetchAndIncrementWacquire(P) __TBB_machine_fetchadd4acquire(P,1) |
317 | #define __TBB_FetchAndDecrementWrelease(P) __TBB_machine_fetchadd4release(P,(-1)) |
#endif /* __TBB_WORDSIZE==8 */
319 | #else /* !__TBB_USE_FENCED_ATOMICS */ |
320 | #define __TBB_FetchAndAddWrelease(P,V) __TBB_FetchAndAddW(P,V) |
321 | #define __TBB_FetchAndIncrementWacquire(P) __TBB_FetchAndAddW(P,1) |
322 | #define __TBB_FetchAndDecrementWrelease(P) __TBB_FetchAndAddW(P,(-1)) |
323 | #endif /* !__TBB_USE_FENCED_ATOMICS */ |
324 | |
325 | #if __TBB_WORDSIZE==4 |
326 | #define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp4(P,V,C) |
327 | #define __TBB_FetchAndAddW(P,V) __TBB_machine_fetchadd4(P,V) |
328 | #define __TBB_FetchAndStoreW(P,V) __TBB_machine_fetchstore4(P,V) |
329 | #elif __TBB_WORDSIZE==8 |
330 | #if __TBB_USE_GENERIC_DWORD_LOAD_STORE || __TBB_USE_GENERIC_DWORD_FETCH_ADD || __TBB_USE_GENERIC_DWORD_FETCH_STORE |
331 | #error These macros should only be used on 32-bit platforms. |
332 | #endif |
333 | |
334 | #define __TBB_CompareAndSwapW(P,V,C) __TBB_machine_cmpswp8(P,V,C) |
335 | #define __TBB_FetchAndAddW(P,V) __TBB_machine_fetchadd8(P,V) |
336 | #define __TBB_FetchAndStoreW(P,V) __TBB_machine_fetchstore8(P,V) |
337 | #else /* __TBB_WORDSIZE != 8 */ |
338 | #error Unsupported machine word size. |
339 | #endif /* __TBB_WORDSIZE */ |
340 | |
341 | #ifndef __TBB_Pause |
342 | inline void __TBB_Pause(int32_t) { |
343 | __TBB_Yield(); |
344 | } |
345 | #endif |
346 | |
347 | namespace tbb { |
348 | |
349 | //! Sequentially consistent full memory fence. |
350 | inline void atomic_fence () { __TBB_full_memory_fence(); } |
351 | |
352 | namespace internal { |
353 | |
354 | //! Class that implements exponential backoff. |
355 | /** See implementation of spin_wait_while_eq for an example. */ |
356 | class atomic_backoff : no_copy { |
357 | //! Time delay, in units of "pause" instructions. |
    /** Should be roughly equal to the number of "pause" instructions
        that take the same time as a context switch. */
360 | static const int32_t LOOPS_BEFORE_YIELD = 16; |
361 | int32_t count; |
362 | public: |
363 | // In many cases, an object of this type is initialized eagerly on hot path, |
364 | // as in for(atomic_backoff b; ; b.pause()) { /*loop body*/ } |
365 | // For this reason, the construction cost must be very small! |
366 | atomic_backoff() : count(1) {} |
367 | // This constructor pauses immediately; do not use on hot paths! |
368 | atomic_backoff( bool ) : count(1) { pause(); } |
369 | |
370 | //! Pause for a while. |
371 | void pause() { |
372 | if( count<=LOOPS_BEFORE_YIELD ) { |
373 | __TBB_Pause(count); |
374 | // Pause twice as long the next time. |
375 | count*=2; |
376 | } else { |
377 | // Pause is so long that we might as well yield CPU to scheduler. |
378 | __TBB_Yield(); |
379 | } |
380 | } |
381 | |
    //! Pause for a bounded number of calls; afterwards return false immediately, without pausing.
383 | bool bounded_pause() { |
384 | if( count<=LOOPS_BEFORE_YIELD ) { |
385 | __TBB_Pause(count); |
386 | // Pause twice as long the next time. |
387 | count*=2; |
388 | return true; |
389 | } else { |
390 | return false; |
391 | } |
392 | } |
393 | |
394 | void reset() { |
395 | count = 1; |
396 | } |
397 | }; |
398 | |
399 | //! Spin WHILE the value of the variable is equal to a given value |
400 | /** T and U should be comparable types. */ |
401 | template<typename T, typename U> |
402 | void spin_wait_while_eq( const volatile T& location, U value ) { |
403 | atomic_backoff backoff; |
404 | while( location==value ) backoff.pause(); |
405 | } |
406 | |
407 | //! Spin UNTIL the value of the variable is equal to a given value |
408 | /** T and U should be comparable types. */ |
409 | template<typename T, typename U> |
410 | void spin_wait_until_eq( const volatile T& location, const U value ) { |
411 | atomic_backoff backoff; |
412 | while( location!=value ) backoff.pause(); |
413 | } |
414 | |
415 | |
416 | //////////////////////////////////////////////////////////////////////////////// |
417 | // Generic compare-and-swap applied to only a part of a machine word. |
418 | // |
419 | #ifndef __TBB_ENDIANNESS |
420 | #define __TBB_ENDIANNESS __TBB_ENDIAN_DETECT |
421 | #endif |
422 | |
423 | #if __TBB_USE_GENERIC_PART_WORD_CAS && __TBB_ENDIANNESS==__TBB_ENDIAN_UNSUPPORTED |
424 | #error Generic implementation of part-word CAS may not be used with __TBB_ENDIAN_UNSUPPORTED |
425 | #endif |
426 | |
427 | #if __TBB_ENDIANNESS!=__TBB_ENDIAN_UNSUPPORTED |
428 | // |
429 | // This function is the only use of __TBB_ENDIANNESS. |
430 | // The following restrictions/limitations apply for this operation: |
431 | // - T must be an integer type of at most 4 bytes for the casts and calculations to work |
432 | // - T must also be less than 4 bytes to avoid compiler warnings when computing mask |
433 | // (and for the operation to be useful at all, so no workaround is applied) |
434 | // - the architecture must consistently use either little-endian or big-endian (same for all locations) |
435 | // |
436 | // TODO: static_assert for the type requirements stated above |
437 | template<typename T> |
438 | inline T __TBB_MaskedCompareAndSwap (volatile T * const ptr, const T value, const T comparand ) { |
439 | struct endianness{ static bool is_big_endian(){ |
440 | #if __TBB_ENDIANNESS==__TBB_ENDIAN_DETECT |
441 | const uint32_t probe = 0x03020100; |
442 | return (((const char*)(&probe))[0]==0x03); |
443 | #elif __TBB_ENDIANNESS==__TBB_ENDIAN_BIG || __TBB_ENDIANNESS==__TBB_ENDIAN_LITTLE |
444 | return __TBB_ENDIANNESS==__TBB_ENDIAN_BIG; |
445 | #else |
446 | #error Unexpected value of __TBB_ENDIANNESS |
447 | #endif |
448 | }}; |
449 | |
450 | const uint32_t byte_offset = (uint32_t) ((uintptr_t)ptr & 0x3); |
451 | volatile uint32_t * const aligned_ptr = (uint32_t*)((uintptr_t)ptr - byte_offset ); |
452 | |
453 | // location of T within uint32_t for a C++ shift operation |
454 | const uint32_t bits_to_shift = 8*(endianness::is_big_endian() ? (4 - sizeof(T) - (byte_offset)) : byte_offset); |
455 | const uint32_t mask = (((uint32_t)1<<(sizeof(T)*8)) - 1 )<<bits_to_shift; |
456 | // for signed T, any sign extension bits in cast value/comparand are immediately clipped by mask |
457 | const uint32_t shifted_comparand = ((uint32_t)comparand << bits_to_shift)&mask; |
458 | const uint32_t shifted_value = ((uint32_t)value << bits_to_shift)&mask; |
459 | |
460 | for( atomic_backoff b;;b.pause() ) { |
461 | const uint32_t surroundings = *aligned_ptr & ~mask ; // may have changed during the pause |
462 | const uint32_t big_comparand = surroundings | shifted_comparand ; |
463 | const uint32_t big_value = surroundings | shifted_value ; |
464 | // __TBB_machine_cmpswp4 presumed to have full fence. |
465 | // Cast shuts up /Wp64 warning |
466 | const uint32_t big_result = (uint32_t)__TBB_machine_cmpswp4( aligned_ptr, big_value, big_comparand ); |
467 | if( big_result == big_comparand // CAS succeeded |
468 | || ((big_result ^ big_comparand) & mask) != 0) // CAS failed and the bits of interest have changed |
469 | { |
470 | return T((big_result & mask) >> bits_to_shift); |
471 | } |
472 | else continue; // CAS failed but the bits of interest were not changed |
473 | } |
474 | } |
475 | #endif // __TBB_ENDIANNESS!=__TBB_ENDIAN_UNSUPPORTED |
476 | //////////////////////////////////////////////////////////////////////////////// |
477 | |
478 | template<size_t S, typename T> |
479 | inline T __TBB_CompareAndSwapGeneric (volatile void *ptr, T value, T comparand ); |
480 | |
481 | template<> |
482 | inline uint8_t __TBB_CompareAndSwapGeneric <1,uint8_t> (volatile void *ptr, uint8_t value, uint8_t comparand ) { |
483 | #if __TBB_USE_GENERIC_PART_WORD_CAS |
484 | return __TBB_MaskedCompareAndSwap<uint8_t>((volatile uint8_t *)ptr,value,comparand); |
485 | #else |
486 | return __TBB_machine_cmpswp1(ptr,value,comparand); |
487 | #endif |
488 | } |
489 | |
490 | template<> |
491 | inline uint16_t __TBB_CompareAndSwapGeneric <2,uint16_t> (volatile void *ptr, uint16_t value, uint16_t comparand ) { |
492 | #if __TBB_USE_GENERIC_PART_WORD_CAS |
493 | return __TBB_MaskedCompareAndSwap<uint16_t>((volatile uint16_t *)ptr,value,comparand); |
494 | #else |
495 | return __TBB_machine_cmpswp2(ptr,value,comparand); |
496 | #endif |
497 | } |
498 | |
499 | template<> |
500 | inline uint32_t __TBB_CompareAndSwapGeneric <4,uint32_t> (volatile void *ptr, uint32_t value, uint32_t comparand ) { |
501 | // Cast shuts up /Wp64 warning |
502 | return (uint32_t)__TBB_machine_cmpswp4(ptr,value,comparand); |
503 | } |
504 | |
505 | #if __TBB_64BIT_ATOMICS |
506 | template<> |
507 | inline uint64_t __TBB_CompareAndSwapGeneric <8,uint64_t> (volatile void *ptr, uint64_t value, uint64_t comparand ) { |
508 | return __TBB_machine_cmpswp8(ptr,value,comparand); |
509 | } |
510 | #endif |
511 | |
512 | template<size_t S, typename T> |
513 | inline T __TBB_FetchAndAddGeneric (volatile void *ptr, T addend) { |
514 | T result; |
515 | for( atomic_backoff b;;b.pause() ) { |
516 | result = *reinterpret_cast<volatile T *>(ptr); |
517 | // __TBB_CompareAndSwapGeneric presumed to have full fence. |
518 | if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, result+addend, result )==result ) |
519 | break; |
520 | } |
521 | return result; |
522 | } |
523 | |
524 | template<size_t S, typename T> |
525 | inline T __TBB_FetchAndStoreGeneric (volatile void *ptr, T value) { |
526 | T result; |
527 | for( atomic_backoff b;;b.pause() ) { |
528 | result = *reinterpret_cast<volatile T *>(ptr); |
529 | // __TBB_CompareAndSwapGeneric presumed to have full fence. |
530 | if( __TBB_CompareAndSwapGeneric<S,T> ( ptr, value, result )==result ) |
531 | break; |
532 | } |
533 | return result; |
534 | } |
535 | |
536 | #if __TBB_USE_GENERIC_PART_WORD_CAS |
537 | #define __TBB_machine_cmpswp1 tbb::internal::__TBB_CompareAndSwapGeneric<1,uint8_t> |
538 | #define __TBB_machine_cmpswp2 tbb::internal::__TBB_CompareAndSwapGeneric<2,uint16_t> |
539 | #endif |
540 | |
541 | #if __TBB_USE_GENERIC_FETCH_ADD || __TBB_USE_GENERIC_PART_WORD_FETCH_ADD |
542 | #define __TBB_machine_fetchadd1 tbb::internal::__TBB_FetchAndAddGeneric<1,uint8_t> |
543 | #define __TBB_machine_fetchadd2 tbb::internal::__TBB_FetchAndAddGeneric<2,uint16_t> |
544 | #endif |
545 | |
546 | #if __TBB_USE_GENERIC_FETCH_ADD |
547 | #define __TBB_machine_fetchadd4 tbb::internal::__TBB_FetchAndAddGeneric<4,uint32_t> |
548 | #endif |
549 | |
550 | #if __TBB_USE_GENERIC_FETCH_ADD || __TBB_USE_GENERIC_DWORD_FETCH_ADD |
551 | #define __TBB_machine_fetchadd8 tbb::internal::__TBB_FetchAndAddGeneric<8,uint64_t> |
552 | #endif |
553 | |
554 | #if __TBB_USE_GENERIC_FETCH_STORE || __TBB_USE_GENERIC_PART_WORD_FETCH_STORE |
555 | #define __TBB_machine_fetchstore1 tbb::internal::__TBB_FetchAndStoreGeneric<1,uint8_t> |
556 | #define __TBB_machine_fetchstore2 tbb::internal::__TBB_FetchAndStoreGeneric<2,uint16_t> |
557 | #endif |
558 | |
559 | #if __TBB_USE_GENERIC_FETCH_STORE |
560 | #define __TBB_machine_fetchstore4 tbb::internal::__TBB_FetchAndStoreGeneric<4,uint32_t> |
561 | #endif |
562 | |
563 | #if __TBB_USE_GENERIC_FETCH_STORE || __TBB_USE_GENERIC_DWORD_FETCH_STORE |
564 | #define __TBB_machine_fetchstore8 tbb::internal::__TBB_FetchAndStoreGeneric<8,uint64_t> |
565 | #endif |
566 | |
567 | #if __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE |
568 | #define __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(S) \ |
569 | atomic_selector<S>::word atomic_selector<S>::fetch_store ( volatile void* location, word value ) { \ |
570 | return __TBB_machine_fetchstore##S( location, value ); \ |
571 | } |
572 | |
573 | __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(1) |
574 | __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(2) |
575 | __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(4) |
576 | __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE(8) |
577 | |
578 | #undef __TBB_MACHINE_DEFINE_ATOMIC_SELECTOR_FETCH_STORE |
579 | #endif /* __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */ |
580 | |
581 | #if __TBB_USE_GENERIC_DWORD_LOAD_STORE |
/*TODO: find a more elegant way to handle the difference in function names*/
583 | #if ! __TBB_USE_FENCED_ATOMICS |
584 | /* This name forwarding is needed for generic implementation of |
585 | * load8/store8 defined below (via macro) to pick the right CAS function*/ |
586 | #define __TBB_machine_cmpswp8full_fence __TBB_machine_cmpswp8 |
587 | #endif |
588 | __TBB_MACHINE_DEFINE_LOAD8_GENERIC_FENCED(full_fence) |
589 | __TBB_MACHINE_DEFINE_STORE8_GENERIC_FENCED(full_fence) |
590 | |
591 | #if ! __TBB_USE_FENCED_ATOMICS |
592 | #undef __TBB_machine_cmpswp8full_fence |
593 | #endif |
594 | |
595 | #define __TBB_machine_store8 tbb::internal::__TBB_machine_generic_store8full_fence |
596 | #define __TBB_machine_load8 tbb::internal::__TBB_machine_generic_load8full_fence |
597 | #endif /* __TBB_USE_GENERIC_DWORD_LOAD_STORE */ |
598 | |
599 | #if __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE |
/** Fenced operations use the volatile qualifier to prevent the compiler from optimizing
    them out, and, on architectures with weak memory ordering, to induce the compiler
    to generate code with appropriate acquire/release semantics.
    On architectures like IA32 and Intel64 (and likely SPARC TSO), volatile has
    no effect on code generation, and the consistency helpers serve as a compiler fence (the
    latter being true for IA64/gcc as well, to fix a bug in some gcc versions).
    This code assumes that the generated instructions will operate atomically,
    which typically requires a type that can be moved in a single instruction,
    cooperation from the compiler for effective use of such an instruction,
    and appropriate alignment of the data. **/
610 | template <typename T, size_t S> |
611 | struct machine_load_store { |
612 | static T load_with_acquire ( const volatile T& location ) { |
613 | T to_return = location; |
614 | __TBB_acquire_consistency_helper(); |
615 | return to_return; |
616 | } |
617 | static void store_with_release ( volatile T &location, T value ) { |
618 | __TBB_release_consistency_helper(); |
619 | location = value; |
620 | } |
621 | }; |
622 | |
// In general, plain loads and stores of 64-bit types compiled for a 32-bit target are not atomic.
624 | #if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS |
625 | template <typename T> |
626 | struct machine_load_store<T,8> { |
627 | static T load_with_acquire ( const volatile T& location ) { |
628 | return (T)__TBB_machine_load8( (const volatile void*)&location ); |
629 | } |
630 | static void store_with_release ( volatile T& location, T value ) { |
631 | __TBB_machine_store8( (volatile void*)&location, (int64_t)value ); |
632 | } |
633 | }; |
634 | #endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */ |
635 | #endif /* __TBB_USE_GENERIC_HALF_FENCED_LOAD_STORE */ |
636 | |
637 | #if __TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE |
638 | template <typename T, size_t S> |
639 | struct machine_load_store_seq_cst { |
640 | static T load ( const volatile T& location ) { |
641 | __TBB_full_memory_fence(); |
642 | return machine_load_store<T,S>::load_with_acquire( location ); |
643 | } |
644 | #if __TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE |
645 | static void store ( volatile T &location, T value ) { |
646 | atomic_selector<S>::fetch_store( (volatile void*)&location, (typename atomic_selector<S>::word)value ); |
647 | } |
648 | #else /* !__TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */ |
649 | static void store ( volatile T &location, T value ) { |
650 | machine_load_store<T,S>::store_with_release( location, value ); |
651 | __TBB_full_memory_fence(); |
652 | } |
653 | #endif /* !__TBB_USE_FETCHSTORE_AS_FULL_FENCED_STORE */ |
654 | }; |
655 | |
656 | #if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS |
657 | /** The implementation does not use functions __TBB_machine_load8/store8 as they |
658 | are not required to be sequentially consistent. **/ |
659 | template <typename T> |
660 | struct machine_load_store_seq_cst<T,8> { |
661 | static T load ( const volatile T& location ) { |
662 | // Comparand and new value may be anything, they only must be equal, and |
663 | // the value should have a low probability to be actually found in 'location'. |
664 | const int64_t anyvalue = 2305843009213693951LL; |
665 | return __TBB_machine_cmpswp8( (volatile void*)const_cast<volatile T*>(&location), anyvalue, anyvalue ); |
666 | } |
667 | static void store ( volatile T &location, T value ) { |
668 | int64_t result = (volatile int64_t&)location; |
669 | while ( __TBB_machine_cmpswp8((volatile void*)&location, (int64_t)value, result) != result ) |
670 | result = (volatile int64_t&)location; |
671 | } |
672 | }; |
673 | #endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */ |
674 | #endif /*__TBB_USE_GENERIC_SEQUENTIAL_CONSISTENCY_LOAD_STORE */ |
675 | |
676 | #if __TBB_USE_GENERIC_RELAXED_LOAD_STORE |
// Relaxed operations add a volatile qualifier to prevent the compiler from optimizing them out.
/** Volatile should not incur any additional cost on IA32, Intel64, and SPARC TSO
    architectures. However, on architectures with weak memory ordering, the compiler may
    generate code with acquire/release semantics for operations on volatile data. **/
681 | template <typename T, size_t S> |
682 | struct machine_load_store_relaxed { |
683 | static inline T load ( const volatile T& location ) { |
684 | return location; |
685 | } |
686 | static inline void store ( volatile T& location, T value ) { |
687 | location = value; |
688 | } |
689 | }; |
690 | |
691 | #if __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS |
692 | template <typename T> |
693 | struct machine_load_store_relaxed<T,8> { |
694 | static inline T load ( const volatile T& location ) { |
695 | return (T)__TBB_machine_load8( (const volatile void*)&location ); |
696 | } |
697 | static inline void store ( volatile T& location, T value ) { |
698 | __TBB_machine_store8( (volatile void*)&location, (int64_t)value ); |
699 | } |
700 | }; |
701 | #endif /* __TBB_WORDSIZE==4 && __TBB_64BIT_ATOMICS */ |
702 | #endif /* __TBB_USE_GENERIC_RELAXED_LOAD_STORE */ |
703 | |
#undef __TBB_WORDSIZE //this macro must not be used outside of the atomic machinery
705 | |
706 | template<typename T> |
707 | inline T __TBB_load_with_acquire(const volatile T &location) { |
708 | return machine_load_store<T,sizeof(T)>::load_with_acquire( location ); |
709 | } |
710 | template<typename T, typename V> |
711 | inline void __TBB_store_with_release(volatile T& location, V value) { |
712 | machine_load_store<T,sizeof(T)>::store_with_release( location, T(value) ); |
713 | } |
714 | //! Overload that exists solely to avoid /Wp64 warnings. |
715 | inline void __TBB_store_with_release(volatile size_t& location, size_t value) { |
716 | machine_load_store<size_t,sizeof(size_t)>::store_with_release( location, value ); |
717 | } |
718 | |
719 | template<typename T> |
720 | inline T __TBB_load_full_fence(const volatile T &location) { |
721 | return machine_load_store_seq_cst<T,sizeof(T)>::load( location ); |
722 | } |
723 | template<typename T, typename V> |
724 | inline void __TBB_store_full_fence(volatile T& location, V value) { |
725 | machine_load_store_seq_cst<T,sizeof(T)>::store( location, T(value) ); |
726 | } |
727 | //! Overload that exists solely to avoid /Wp64 warnings. |
728 | inline void __TBB_store_full_fence(volatile size_t& location, size_t value) { |
729 | machine_load_store_seq_cst<size_t,sizeof(size_t)>::store( location, value ); |
730 | } |
731 | |
732 | template<typename T> |
733 | inline T __TBB_load_relaxed (const volatile T& location) { |
734 | return machine_load_store_relaxed<T,sizeof(T)>::load( const_cast<T&>(location) ); |
735 | } |
736 | template<typename T, typename V> |
737 | inline void __TBB_store_relaxed ( volatile T& location, V value ) { |
738 | machine_load_store_relaxed<T,sizeof(T)>::store( const_cast<T&>(location), T(value) ); |
739 | } |
740 | //! Overload that exists solely to avoid /Wp64 warnings. |
741 | inline void __TBB_store_relaxed ( volatile size_t& location, size_t value ) { |
742 | machine_load_store_relaxed<size_t,sizeof(size_t)>::store( const_cast<size_t&>(location), value ); |
743 | } |
744 | |
745 | // Macro __TBB_TypeWithAlignmentAtLeastAsStrict(T) should be a type with alignment at least as |
746 | // strict as type T. The type should have a trivial default constructor and destructor, so that |
747 | // arrays of that type can be declared without initializers. |
748 | // It is correct (but perhaps a waste of space) if __TBB_TypeWithAlignmentAtLeastAsStrict(T) expands |
749 | // to a type bigger than T. |
// The default definition here works on machines where integers are naturally aligned and the
// strictest alignment requirement is 64 bytes.
752 | #ifndef __TBB_TypeWithAlignmentAtLeastAsStrict |
753 | |
754 | #if __TBB_ATTRIBUTE_ALIGNED_PRESENT |
755 | |
756 | #define __TBB_DefineTypeWithAlignment(PowerOf2) \ |
757 | struct __TBB_machine_type_with_alignment_##PowerOf2 { \ |
758 | uint32_t member[PowerOf2/sizeof(uint32_t)]; \ |
759 | } __attribute__((aligned(PowerOf2))); |
760 | #define __TBB_alignof(T) __alignof__(T) |
761 | |
762 | #elif __TBB_DECLSPEC_ALIGN_PRESENT |
763 | |
764 | #define __TBB_DefineTypeWithAlignment(PowerOf2) \ |
765 | __declspec(align(PowerOf2)) \ |
766 | struct __TBB_machine_type_with_alignment_##PowerOf2 { \ |
767 | uint32_t member[PowerOf2/sizeof(uint32_t)]; \ |
768 | }; |
769 | #define __TBB_alignof(T) __alignof(T) |
770 | |
771 | #else /* A compiler with unknown syntax for data alignment */ |
772 | #error Must define __TBB_TypeWithAlignmentAtLeastAsStrict(T) |
773 | #endif |
774 | |
775 | /* Now declare types aligned to useful powers of two */ |
776 | // TODO: Is __TBB_DefineTypeWithAlignment(8) needed on 32 bit platforms? |
777 | __TBB_DefineTypeWithAlignment(16) |
778 | __TBB_DefineTypeWithAlignment(32) |
779 | __TBB_DefineTypeWithAlignment(64) |
780 | |
781 | typedef __TBB_machine_type_with_alignment_64 __TBB_machine_type_with_strictest_alignment; |
782 | |
// The primary template is a declaration of an incomplete type, so that instantiation with an unsupported alignment fails to compile
784 | template<size_t N> struct type_with_alignment; |
785 | |
786 | // Specializations for allowed alignments |
787 | template<> struct type_with_alignment<1> { char member; }; |
788 | template<> struct type_with_alignment<2> { uint16_t member; }; |
789 | template<> struct type_with_alignment<4> { uint32_t member; }; |
790 | template<> struct type_with_alignment<8> { uint64_t member; }; |
791 | template<> struct type_with_alignment<16> {__TBB_machine_type_with_alignment_16 member; }; |
792 | template<> struct type_with_alignment<32> {__TBB_machine_type_with_alignment_32 member; }; |
793 | template<> struct type_with_alignment<64> {__TBB_machine_type_with_alignment_64 member; }; |
794 | |
795 | #if __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN |
//! Workaround for a bug in GNU 3.2 and MSVC compilers.
/** The bug is that the compiler sometimes returns 0 for __alignof(T) when T has not yet been instantiated.
    The workaround forces instantiation by forcing computation of sizeof(T) before __alignof(T). */
799 | template<size_t Size, typename T> |
800 | struct work_around_alignment_bug { |
801 | static const size_t alignment = __TBB_alignof(T); |
802 | }; |
803 | #define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<tbb::internal::work_around_alignment_bug<sizeof(T),T>::alignment> |
804 | #else |
805 | #define __TBB_TypeWithAlignmentAtLeastAsStrict(T) tbb::internal::type_with_alignment<__TBB_alignof(T)> |
806 | #endif /* __TBB_ALIGNOF_NOT_INSTANTIATED_TYPES_BROKEN */ |
807 | |
808 | #endif /* __TBB_TypeWithAlignmentAtLeastAsStrict */ |
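
// Illustrative sketch: the macro yields a type whose alignment is at least that
// of T; TBB's aligned_space uses it to declare uninitialized storage for T
// ('my_t' is hypothetical):
//
//     typedef __TBB_TypeWithAlignmentAtLeastAsStrict(my_t) element_t;
//     element_t storage[(sizeof(my_t)+sizeof(element_t)-1)/sizeof(element_t)];
//     my_t* obj = new( static_cast<void*>(storage) ) my_t();  // requires <new>
//     obj->~my_t();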
809 | |
// A template class is used here to avoid instantiating the static data in modules that don't use it
811 | template<typename T> |
812 | struct reverse { |
813 | static const T byte_table[256]; |
814 | }; |
815 | // An efficient implementation of the reverse function utilizes a 2^8 lookup table holding the bit-reversed |
816 | // values of [0..2^8 - 1]. Those values can also be computed on the fly at a slightly higher cost. |
817 | template<typename T> |
818 | const T reverse<T>::byte_table[256] = { |
819 | 0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0, |
820 | 0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8, |
821 | 0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4, |
822 | 0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC, |
823 | 0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2, |
824 | 0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA, |
825 | 0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6, |
826 | 0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE, |
827 | 0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1, |
828 | 0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9, |
829 | 0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5, |
830 | 0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD, |
831 | 0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3, |
832 | 0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB, |
833 | 0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7, |
834 | 0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF |
835 | }; |
836 | |
837 | } // namespace internal |
838 | } // namespace tbb |
839 | |
840 | // Preserving access to legacy APIs |
841 | using tbb::internal::__TBB_load_with_acquire; |
842 | using tbb::internal::__TBB_store_with_release; |
843 | |
844 | // Mapping historically used names to the ones expected by atomic_load_store_traits |
845 | #define __TBB_load_acquire __TBB_load_with_acquire |
846 | #define __TBB_store_release __TBB_store_with_release |
847 | |
848 | #ifndef __TBB_Log2 |
849 | inline intptr_t __TBB_Log2( uintptr_t x ) { |
850 | if( x==0 ) return -1; |
851 | intptr_t result = 0; |
852 | |
853 | #if !defined(_M_ARM) |
854 | uintptr_t tmp; |
855 | if( sizeof(x)>4 && (tmp = ((uint64_t)x)>>32) ) { x=tmp; result += 32; } |
856 | #endif |
857 | if( uintptr_t tmp = x>>16 ) { x=tmp; result += 16; } |
858 | if( uintptr_t tmp = x>>8 ) { x=tmp; result += 8; } |
859 | if( uintptr_t tmp = x>>4 ) { x=tmp; result += 4; } |
860 | if( uintptr_t tmp = x>>2 ) { x=tmp; result += 2; } |
861 | |
862 | return (x&2)? result+1: result; |
863 | } |
864 | #endif |
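
// Worked example: __TBB_Log2(40)==5, the index of the highest set bit (40 is
// 101000b). The halving steps run as follows: 40>>16 and 40>>8 are zero and
// skipped; 40>>4==2 sets x=2 and result=4; 2>>2 is zero; finally x&2 is
// nonzero, giving result+1==5. __TBB_Log2(0) returns -1 via the early check.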
865 | |
866 | #ifndef __TBB_AtomicOR |
867 | inline void __TBB_AtomicOR( volatile void *operand, uintptr_t addend ) { |
868 | for( tbb::internal::atomic_backoff b;;b.pause() ) { |
869 | uintptr_t tmp = *(volatile uintptr_t *)operand; |
870 | uintptr_t result = __TBB_CompareAndSwapW(operand, tmp|addend, tmp); |
871 | if( result==tmp ) break; |
872 | } |
873 | } |
874 | #endif |
875 | |
876 | #ifndef __TBB_AtomicAND |
877 | inline void __TBB_AtomicAND( volatile void *operand, uintptr_t addend ) { |
878 | for( tbb::internal::atomic_backoff b;;b.pause() ) { |
879 | uintptr_t tmp = *(volatile uintptr_t *)operand; |
880 | uintptr_t result = __TBB_CompareAndSwapW(operand, tmp&addend, tmp); |
881 | if( result==tmp ) break; |
882 | } |
883 | } |
884 | #endif |
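
// Illustrative sketch: setting and clearing a bit in a word-sized flag set
// ('flags' is hypothetical). Despite the parameter name 'addend', the second
// argument is treated as a plain bit mask:
//
//     uintptr_t flags = 0;
//     __TBB_AtomicOR ( &flags, uintptr_t(1)<<3 );     // atomically set bit 3
//     __TBB_AtomicAND( &flags, ~(uintptr_t(1)<<3) );  // atomically clear bit 3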
885 | |
886 | #if __TBB_PREFETCHING |
887 | #ifndef __TBB_cl_prefetch |
888 | #error This platform does not define cache management primitives required for __TBB_PREFETCHING |
889 | #endif |
890 | |
891 | #ifndef __TBB_cl_evict |
892 | #define __TBB_cl_evict(p) |
893 | #endif |
894 | #endif |
895 | |
896 | #ifndef __TBB_Flag |
897 | typedef unsigned char __TBB_Flag; |
898 | #endif |
899 | typedef __TBB_atomic __TBB_Flag __TBB_atomic_flag; |
900 | |
901 | #ifndef __TBB_TryLockByte |
902 | inline bool __TBB_TryLockByte( __TBB_atomic_flag &flag ) { |
903 | return __TBB_machine_cmpswp1(&flag,1,0)==0; |
904 | } |
905 | #endif |
906 | |
907 | #ifndef __TBB_LockByte |
908 | inline __TBB_Flag __TBB_LockByte( __TBB_atomic_flag& flag ) { |
909 | tbb::internal::atomic_backoff backoff; |
910 | while( !__TBB_TryLockByte(flag) ) backoff.pause(); |
911 | return 0; |
912 | } |
913 | #endif |
914 | |
915 | #ifndef __TBB_UnlockByte |
916 | #define __TBB_UnlockByte(addr) __TBB_store_with_release((addr),0) |
917 | #endif |
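
// Illustrative sketch of a minimal spin lock built on the byte-lock primitives
// ('my_lock' is hypothetical):
//
//     __TBB_atomic_flag my_lock = 0;
//
//     void locked_work() {
//         __TBB_LockByte( my_lock );    // spins with exponential backoff
//         /* ... protected work ... */
//         __TBB_UnlockByte( my_lock );  // release store of 0
//     }
//
// The non-blocking variant is: if( __TBB_TryLockByte(my_lock) ) { ... }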
918 | |
// Lock primitives using Intel TSX (hardware lock elision)
920 | #if ( __TBB_x86_32 || __TBB_x86_64 ) /* only on ia32/intel64 */ |
921 | inline void __TBB_TryLockByteElidedCancel() { __TBB_machine_try_lock_elided_cancel(); } |
922 | |
923 | inline bool __TBB_TryLockByteElided( __TBB_atomic_flag& flag ) { |
924 | bool res = __TBB_machine_try_lock_elided( &flag )!=0; |
925 | // to avoid the "lemming" effect, we need to abort the transaction |
926 | // if __TBB_machine_try_lock_elided returns false (i.e., someone else |
927 | // has acquired the mutex non-speculatively). |
928 | if( !res ) __TBB_TryLockByteElidedCancel(); |
929 | return res; |
930 | } |
931 | |
932 | inline void __TBB_LockByteElided( __TBB_atomic_flag& flag ) |
933 | { |
934 | for(;;) { |
935 | tbb::internal::spin_wait_while_eq( flag, 1 ); |
936 | if( __TBB_machine_try_lock_elided( &flag ) ) |
937 | return; |
938 | // Another thread acquired the lock "for real". |
939 | // To avoid the "lemming" effect, we abort the transaction. |
940 | __TBB_TryLockByteElidedCancel(); |
941 | } |
942 | } |
943 | |
944 | inline void __TBB_UnlockByteElided( __TBB_atomic_flag& flag ) { |
945 | __TBB_machine_unlock_elided( &flag ); |
946 | } |
947 | #endif |
948 | |
949 | #ifndef __TBB_ReverseByte |
950 | inline unsigned char __TBB_ReverseByte(unsigned char src) { |
951 | return tbb::internal::reverse<unsigned char>::byte_table[src]; |
952 | } |
953 | #endif |
954 | |
955 | template<typename T> |
956 | T __TBB_ReverseBits(T src) { |
957 | T dst; |
958 | unsigned char *original = (unsigned char *) &src; |
959 | unsigned char *reversed = (unsigned char *) &dst; |
960 | |
961 | for( int i = sizeof(T)-1; i >= 0; i-- ) |
962 | reversed[i] = __TBB_ReverseByte( original[sizeof(T)-i-1] ); |
963 | |
964 | return dst; |
965 | } |
966 | |
967 | #endif /* __TBB_machine_H */ |
968 | |