/*-------------------------------------------------------------------------
 *
 * arch-x86.h
 *    Atomic operation considerations specific to Intel x86
 *
 * Note that we actually require a 486 or newer because the 386 doesn't have
 * support for xadd and cmpxchg.  Given that the 386 isn't supported anywhere
 * anymore, that's luckily not much of a restriction.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES:
 *
 * src/include/port/atomics/arch-x86.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * Both 32 and 64 bit x86 do not allow loads to be reordered with other
 * loads, or stores to be reordered with other stores, but a load can be
 * performed before an earlier store becomes visible (i.e. store/load
 * reordering is possible).  That is the only reordering the full memory
 * barrier below has to prevent, which is why the read and write barriers
 * can be plain compiler barriers.
 *
 * Technically, some x86-ish chips support uncached memory access and/or
 * special instructions that are weakly ordered.  In those cases we'd need
 * the read and write barriers to be lfence and sfence.  But since we don't
 * do those things, a compiler barrier should be enough.
 *
 * "lock; addl" has been supported for longer than "mfence", and is also
 * reported to be faster in many scenarios.
 *
 * An illustrative usage sketch follows the barrier definitions below.
 */

#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#if defined(__i386__) || defined(__i386)
#define pg_memory_barrier_impl() \
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
#elif defined(__x86_64__)
#define pg_memory_barrier_impl() \
    __asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
#endif
#endif    /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

#define pg_read_barrier_impl()  pg_compiler_barrier_impl()
#define pg_write_barrier_impl() pg_compiler_barrier_impl()
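
/*
 * Illustrative usage sketch (not part of the original header), assuming the
 * generic pg_write_barrier()/pg_read_barrier() wrappers from atomics.h and a
 * hypothetical shared struct; use() is a placeholder.  Because x86 already
 * orders load/load and store/store, the compiler barrier above is all that
 * is needed for the classic message-passing pattern:
 *
 *	// producer
 *	shared->data = 42;			// store the payload first
 *	pg_write_barrier();			// keep the two stores in program order
 *	shared->flag = true;		// then publish it
 *
 *	// consumer
 *	if (shared->flag)			// observe the flag ...
 *	{
 *		pg_read_barrier();		// ... keep the two loads in program order
 *		use(shared->data);		// guaranteed to observe the payload
 *	}
 */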

/*
 * Provide implementations for atomics using inline assembly on x86 gcc.
 * It's nice to support older versions of gcc, and the compare/exchange
 * implementation here is actually more efficient than the __sync variant.
 */
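
/*
 * For comparison, a hypothetical sketch (not part of the original header) of
 * a compare/exchange built on gcc's __sync builtins, roughly as the generic
 * fallback in atomics/generic-gcc.h does.  __sync_val_compare_and_swap
 * returns the old value, so a separate comparison is needed afterwards,
 * whereas the inline asm below consumes the zero flag set by cmpxchg
 * directly:
 *
 *	static inline bool
 *	cas_u32_via_sync(volatile uint32 *ptr, uint32 *expected, uint32 newval)
 *	{
 *		uint32	current = __sync_val_compare_and_swap(ptr, *expected, newval);
 *
 *		if (current == *expected)
 *			return true;		// the swap happened
 *		*expected = current;	// report the value actually found
 *		return false;
 *	}
 */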
#if defined(HAVE_ATOMICS)

#if defined(__GNUC__) || defined(__INTEL_COMPILER)

#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef struct pg_atomic_flag
{
    volatile char value;
} pg_atomic_flag;

#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
    volatile uint32 value;
} pg_atomic_uint32;

/*
 * It's too complicated to write inline asm for 64bit types on 32bit and the
 * 486 can't do it anyway.
 */
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
    /* alignment guaranteed due to being on a 64bit platform */
    volatile uint64 value;
} pg_atomic_uint64;
#endif    /* __x86_64__ */

#endif    /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

#endif    /* defined(HAVE_ATOMICS) */

#if !defined(PG_HAVE_SPIN_DELAY)
/*
 * This sequence is equivalent to the PAUSE instruction ("rep" is
 * ignored by old IA32 processors if the following instruction is
 * not a string operation); the IA-32 Architecture Software
 * Developer's Manual, Vol. 3, Section 7.7.2 describes why using
 * PAUSE in the inner loop of a spin lock is necessary for good
 * performance:
 *
 *     The PAUSE instruction improves the performance of IA-32
 *     processors supporting Hyper-Threading Technology when
 *     executing spin-wait loops and other routines where one
 *     thread is accessing a shared lock or semaphore in a tight
 *     polling loop.  When executing a spin-wait loop, the
 *     processor can suffer a severe performance penalty when
 *     exiting the loop because it detects a possible memory order
 *     violation and flushes the core processor's pipeline.  The
 *     PAUSE instruction provides a hint to the processor that the
 *     code sequence is a spin-wait loop.  The processor uses this
 *     hint to avoid the memory order violation and prevent the
 *     pipeline flush.  In addition, the PAUSE instruction
 *     de-pipelines the spin-wait loop to prevent it from
 *     consuming execution resources excessively.
 *
 * A sketch of such a spin-wait loop follows this #if block.
 */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_SPIN_DELAY
static __inline__ void
pg_spin_delay_impl(void)
{
    __asm__ __volatile__(" rep; nop \n");
}
#elif defined(_MSC_VER) && (defined(_M_AMD64) || defined(__x86_64__))
/* 64-bit MSVC doesn't support inline assembly, so use the intrinsic */
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    _mm_pause();
}
#elif defined(_MSC_VER)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    /* See comment for gcc code.  Same code, MASM syntax */
    __asm rep nop;
}
#endif
#endif    /* !defined(PG_HAVE_SPIN_DELAY) */
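
/*
 * Illustrative sketch (not part of the original header): roughly how the
 * spinlock code ends up using the delay hook above.  TAS() and the lock
 * variable are simplified stand-ins for the real slock_t machinery in
 * s_lock.h; the point is that PAUSE is emitted inside the tight polling
 * loop.
 *
 *	while (TAS(lock))					// try to acquire the lock
 *	{
 *		while (*lock)					// poll with plain reads, not TAS
 *			pg_spin_delay_impl();		// PAUSE: yield resources to the sibling hyperthread
 *	}
 */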


#if defined(HAVE_ATOMICS)

#if defined(__GNUC__) || defined(__INTEL_COMPILER)

#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
    register char _res = 1;

    __asm__ __volatile__(
        "  lock          \n"
        "  xchgb  %0,%1  \n"
        : "+q" (_res), "+m" (ptr->value)
        :
        : "memory");
    return _res == 0;
}

#define PG_HAVE_ATOMIC_CLEAR_FLAG
static inline void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
    /*
     * On a TSO architecture like x86 it's sufficient to use a compiler
     * barrier to achieve release semantics.
     */
    __asm__ __volatile__("" ::: "memory");
    ptr->value = 0;
}
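
/*
 * Illustrative usage sketch (not part of the original header), assuming the
 * public wrappers from atomics.h; lock_state is a hypothetical shared flag
 * used as a minimal test-and-set lock:
 *
 *	pg_atomic_init_flag(&lock_state);				// once, before first use
 *
 *	while (!pg_atomic_test_set_flag(&lock_state))	// true once we own the flag
 *		pg_spin_delay_impl();						// spin politely
 *	// ... critical section ...
 *	pg_atomic_clear_flag(&lock_state);				// release: barrier + plain store
 */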

#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
                                    uint32 *expected, uint32 newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag, which it sets when the values
     * are equal, to determine whether the exchange succeeded.
     */
    __asm__ __volatile__(
        "  lock              \n"
        "  cmpxchgl  %4,%5   \n"
        "  setz      %2      \n"
        : "=a" (*expected), "=m" (ptr->value), "=q" (ret)
        : "a" (*expected), "r" (newval), "m" (ptr->value)
        : "memory", "cc");
    return (bool) ret;
}
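
/*
 * Illustrative usage sketch (not part of the original header), assuming the
 * public wrappers pg_atomic_read_u32()/pg_atomic_compare_exchange_u32() from
 * atomics.h; the function and variables are hypothetical.  A typical CAS
 * retry loop, here maintaining a shared running maximum:
 *
 *	static void
 *	atomic_update_max(volatile pg_atomic_uint32 *shared_max, uint32 candidate)
 *	{
 *		uint32	cur = pg_atomic_read_u32(shared_max);
 *
 *		while (candidate > cur)
 *		{
 *			// on failure, cur is refreshed with the current value; retry
 *			if (pg_atomic_compare_exchange_u32(shared_max, &cur, candidate))
 *				break;
 *		}
 *	}
 */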

#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
    uint32 res;

    __asm__ __volatile__(
        "  lock          \n"
        "  xaddl  %0,%1  \n"
        : "=q" (res), "=m" (ptr->value)
        : "0" (add_), "m" (ptr->value)
        : "memory", "cc");
    return res;
}
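
/*
 * Illustrative note (not part of the original header): xadd leaves the value
 * the counter held *before* the addition in the source register, which is
 * why the public wrapper can hand out unique values, e.g. (counter is a
 * hypothetical shared pg_atomic_uint32):
 *
 *	uint32	my_slot = pg_atomic_fetch_add_u32(&counter, 1);	// returns the old value
 */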

#ifdef __x86_64__

#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
                                    uint64 *expected, uint64 newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag, which it sets when the values
     * are equal, to determine whether the exchange succeeded.
     */
    __asm__ __volatile__(
        "  lock              \n"
        "  cmpxchgq  %4,%5   \n"
        "  setz      %2      \n"
        : "=a" (*expected), "=m" (ptr->value), "=q" (ret)
        : "a" (*expected), "r" (newval), "m" (ptr->value)
        : "memory", "cc");
    return (bool) ret;
}

#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
    uint64 res;

    __asm__ __volatile__(
        "  lock          \n"
        "  xaddq  %0,%1  \n"
        : "=q" (res), "=m" (ptr->value)
        : "0" (add_), "m" (ptr->value)
        : "memory", "cc");
    return res;
}

#endif    /* __x86_64__ */

#endif    /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

/*
 * 8 byte reads / writes have single-copy atomicity on 32 bit x86 platforms
 * since at least the Pentium (586), as well as on all x86-64 CPUs.
 */
#if defined(__i586__) || defined(__i686__) || /* gcc i586+ */ \
    (defined(_M_IX86) && _M_IX86 >= 500) || /* msvc i586+ */ \
    defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, sunpro, msvc */
#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
#endif    /* 8 byte single-copy atomicity */
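
/*
 * Illustrative note (not part of the original header): when this macro is
 * defined, the generic atomics layer can, roughly speaking, perform 64 bit
 * atomic reads and writes as plain aligned loads and stores rather than
 * falling back to a locked compare-and-exchange.  Simplified sketch, not the
 * exact upstream code:
 *
 *	#ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
 *		val = *(volatile uint64 *) &ptr->value;		// one aligned load is atomic
 *	#else
 *		val = pg_atomic_read_u64_impl(ptr);			// cmpxchg-based fallback
 *	#endif
 */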

#endif    /* HAVE_ATOMICS */