/*-------------------------------------------------------------------------
 *
 * arch-x86.h
 *	  Atomic operation considerations specific to Intel x86
 *
 * Note that we actually require a 486 or newer, because the 386 doesn't have
 * support for xadd and cmpxchg. Given that the 386 isn't supported anywhere
 * anymore, that's luckily not much of a restriction.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES:
 *
 * src/include/port/atomics/arch-x86.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * Both 32 and 64 bit x86 do not allow loads to be reordered with other loads,
 * or stores to be reordered with other stores. The only reordering permitted
 * is that a load can be performed before an earlier store (to a different
 * location) has become visible, i.e. stores may be delayed past subsequent
 * loads.
 *
 * Technically, some x86-ish chips support uncached memory access and/or
 * special instructions that are weakly ordered. In those cases we'd need
 * the read and write barriers to be lfence and sfence. But since we don't
 * do those things, a compiler barrier should be enough.
 *
 * "lock; addl" has worked for longer than "mfence". It's also rumored to be
 * faster in many scenarios.
 */

#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#if defined(__i386__) || defined(__i386)
#define pg_memory_barrier_impl() \
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
#elif defined(__x86_64__)
#define pg_memory_barrier_impl() \
    __asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
#endif
#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

#define pg_read_barrier_impl()  pg_compiler_barrier_impl()
#define pg_write_barrier_impl() pg_compiler_barrier_impl()

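/*
 * Illustrative sketch only (not part of this header): the generic wrappers in
 * atomics.h map pg_read_barrier()/pg_write_barrier() onto the compiler
 * barriers above, so on x86 a simple flag-passing pattern needs no fence
 * instruction at all. Hypothetical example; "data" and "flag" are assumed
 * shared variables, not anything defined here:
 *
 *     writer:
 *         data = 42;
 *         pg_write_barrier();
 *         flag = true;
 *
 *     reader:
 *         while (!flag)
 *             ;
 *         pg_read_barrier();
 *         use(data);
 *
 * The full pg_memory_barrier() (the locked add above) is only needed when a
 * store must be ordered before a later load, the one reordering x86 allows.
 */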
/*
 * Provide implementation for atomics using inline assembly on x86 gcc. It's
 * nice to support older gcc versions, and the compare/exchange implementation
 * here is actually more efficient than the __sync variant.
 */
#if defined(HAVE_ATOMICS)

#if defined(__GNUC__) || defined(__INTEL_COMPILER)

#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef struct pg_atomic_flag
{
    volatile char value;
} pg_atomic_flag;

#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
    volatile uint32 value;
} pg_atomic_uint32;

/*
 * It's too complicated to write inline asm for 64bit types on 32bit and the
 * 486 can't do it anyway.
 */
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
    /* alignment guaranteed due to being on a 64bit platform */
    volatile uint64 value;
} pg_atomic_uint64;
#endif /* __x86_64__ */

#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

#endif /* defined(HAVE_ATOMICS) */

#if !defined(PG_HAVE_SPIN_DELAY)
/*
 * This sequence is equivalent to the PAUSE instruction ("rep" is
 * ignored by old IA32 processors if the following instruction is
 * not a string operation); the IA-32 Architecture Software
 * Developer's Manual, Vol. 3, Section 7.7.2 describes why using
 * PAUSE in the inner loop of a spin lock is necessary for good
 * performance:
 *
 *     The PAUSE instruction improves the performance of IA-32
 *     processors supporting Hyper-Threading Technology when
 *     executing spin-wait loops and other routines where one
 *     thread is accessing a shared lock or semaphore in a tight
 *     polling loop. When executing a spin-wait loop, the
 *     processor can suffer a severe performance penalty when
 *     exiting the loop because it detects a possible memory order
 *     violation and flushes the core processor's pipeline. The
 *     PAUSE instruction provides a hint to the processor that the
 *     code sequence is a spin-wait loop. The processor uses this
 *     hint to avoid the memory order violation and prevent the
 *     pipeline flush. In addition, the PAUSE instruction
 *     de-pipelines the spin-wait loop to prevent it from
 *     consuming execution resources excessively.
 */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_SPIN_DELAY
static __inline__ void
pg_spin_delay_impl(void)
{
    __asm__ __volatile__(" rep; nop \n");
}
#elif defined(_MSC_VER) && defined(__x86_64__)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    _mm_pause();
}
#elif defined(_MSC_VER)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    /* See comment for gcc code. Same code, MASM syntax. */
    __asm rep nop;
}
#endif
#endif /* !defined(PG_HAVE_SPIN_DELAY) */
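/*
 * Illustrative sketch only (not part of this header): the intended use is one
 * pg_spin_delay_impl() call per iteration of a spin-wait loop, e.g.
 * (hypothetical; "lock" and the test-and-set primitive, such as TAS() from
 * s_lock.h, are assumptions, not defined here):
 *
 *     while (TAS(lock))
 *         pg_spin_delay_impl();    // emits "pause"; cheap hint to the CPU
 *
 * In PostgreSQL the surrounding retry/backoff policy lives in the spinlock
 * code (s_lock.h/s_lock.c); the PAUSE hint only helps the innermost loop.
 */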

#if defined(HAVE_ATOMICS)

#if defined(__GNUC__) || defined(__INTEL_COMPILER)

#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
    register char _res = 1;

    __asm__ __volatile__(
        " lock \n"
        " xchgb %0,%1 \n"
:   "+q"(_res), "+m"(ptr->value)
:
:   "memory");
    return _res == 0;
}

#define PG_HAVE_ATOMIC_CLEAR_FLAG
static inline void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
    /*
     * On a TSO architecture like x86 it's sufficient to use a compiler
     * barrier to achieve release semantics.
     */
    __asm__ __volatile__("" ::: "memory");
    ptr->value = 0;
}
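/*
 * Illustrative sketch only (not part of this header): these two primitives
 * back the generic pg_atomic_flag API in atomics.h, which can serve as a
 * minimal test-and-set lock. Hypothetical example; "mutex" is an assumed
 * variable, not something defined here:
 *
 *     static pg_atomic_flag mutex;
 *
 *     pg_atomic_init_flag(&mutex);
 *     ...
 *     while (!pg_atomic_test_set_flag(&mutex))
 *         pg_spin_delay_impl();    // spin until we acquire the flag
 *     ... critical section ...
 *     pg_atomic_clear_flag(&mutex); // release; the compiler barrier above
 *                                   // is enough on TSO
 */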

#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
                                    uint32 *expected, uint32 newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag which it implicitly sets when
     * equal to measure the success.
     */
    __asm__ __volatile__(
        " lock \n"
        " cmpxchgl %4,%5 \n"
        " setz %2 \n"
:   "=a" (*expected), "=m"(ptr->value), "=q" (ret)
:   "a" (*expected), "r" (newval), "m"(ptr->value)
:   "memory", "cc");
    return (bool) ret;
}
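/*
 * Illustrative sketch only (not part of this header): the generic
 * pg_atomic_compare_exchange_u32() wrapper built on this is typically used in
 * a retry loop. On failure *expected is overwritten with the current value
 * (cmpxchg leaves it in %eax), so the loop needs no separate re-read.
 * Hypothetical example computing an atomic maximum; "var", "newval" and "old"
 * are assumed names:
 *
 *     uint32 old = pg_atomic_read_u32(&var);
 *
 *     while (old < newval &&
 *            !pg_atomic_compare_exchange_u32(&var, &old, newval))
 *         ;    // "old" now holds the value that beat us; retry
 */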

#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
    uint32 res;

    __asm__ __volatile__(
        " lock \n"
        " xaddl %0,%1 \n"
:   "=q"(res), "=m"(ptr->value)
:   "0" (add_), "m"(ptr->value)
:   "memory", "cc");
    return res;
}
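/*
 * Illustrative sketch only (not part of this header): xadd returns the
 * pre-add value, so the generic pg_atomic_fetch_add_u32() wrapper can hand
 * out e.g. unique ticket numbers without a CAS loop. Hypothetical example;
 * "next_ticket" is an assumed counter, not defined here:
 *
 *     uint32 my_ticket = pg_atomic_fetch_add_u32(&next_ticket, 1);
 *
 * Subtraction can be expressed by passing a negative add_, which is why the
 * increment argument is a signed int32.
 */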

#ifdef __x86_64__

#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
                                    uint64 *expected, uint64 newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag which it implicitly sets when
     * equal to measure the success.
     */
    __asm__ __volatile__(
        " lock \n"
        " cmpxchgq %4,%5 \n"
        " setz %2 \n"
:   "=a" (*expected), "=m"(ptr->value), "=q" (ret)
:   "a" (*expected), "r" (newval), "m"(ptr->value)
:   "memory", "cc");
    return (bool) ret;
}

#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
    uint64 res;

    __asm__ __volatile__(
        " lock \n"
        " xaddq %0,%1 \n"
:   "=q"(res), "=m"(ptr->value)
:   "0" (add_), "m"(ptr->value)
:   "memory", "cc");
    return res;
}

#endif /* __x86_64__ */

#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

/*
 * 8 byte reads / writes have single-copy atomicity on 32 bit x86 platforms
 * since at least the 586, as well as on all x86-64 CPUs.
 */
#if defined(__i586__) || defined(__i686__) || /* gcc i586+ */ \
    (defined(_M_IX86) && _M_IX86 >= 500) || /* msvc i586+ */ \
    defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, sunpro, msvc */
#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
#endif /* 8 byte single-copy atomicity */
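/*
 * Illustrative note (not part of the API): single-copy atomicity means a
 * concurrent reader of an aligned 8-byte value sees either the old or the new
 * contents, never a torn mix of the two. Where this macro is defined, the
 * generic atomics layer can treat 64 bit reads and writes as plain loads and
 * stores instead of falling back to a compare-and-exchange. Hypothetical
 * illustration; "counter" is an assumed pg_atomic_uint64:
 *
 *     // safe as a plain load given PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
 *     uint64 v = pg_atomic_read_u64(&counter);
 */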

#endif /* HAVE_ATOMICS */