/*-------------------------------------------------------------------------
 *
 * arch-x86.h
 *    Atomic operations considerations specific to Intel x86
 *
 * Note that we actually require a 486 upwards because the 386 doesn't have
 * support for xadd and cmpxchg.  Given that the 386 isn't supported anywhere
 * anymore, that's luckily not much of a restriction.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES:
 *
 * src/include/port/atomics/arch-x86.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * Both 32 and 64 bit x86 do not allow loads to be reordered with other loads,
 * or stores to be reordered with other stores, but a later load can be
 * performed before an earlier store becomes visible, i.e. store-load
 * reordering through the store buffer is permitted.
 *
 * Technically, some x86-ish chips support uncached memory access and/or
 * special instructions that are weakly ordered.  In those cases we'd need
 * the read and write barriers to be lfence and sfence.  But since we don't
 * do those things, a compiler barrier should be enough.
 *
 * "lock; addl" has worked for longer than "mfence".  It's also rumored to be
 * faster in many scenarios.
 */

#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#if defined(__i386__) || defined(__i386)
#define pg_memory_barrier_impl() \
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
#elif defined(__x86_64__)
#define pg_memory_barrier_impl() \
    __asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
#endif
#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

#define pg_read_barrier_impl()  pg_compiler_barrier_impl()
#define pg_write_barrier_impl() pg_compiler_barrier_impl()
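
/*
 * Illustrative sketch (editorial, not used by this header): the reordering
 * the full barrier has to prevent is the classic store-buffering pattern.
 * With x = y = 0 initially:
 *
 *     CPU 0                      CPU 1
 *     x = 1;                     y = 1;
 *     pg_memory_barrier();       pg_memory_barrier();
 *     r0 = y;                    r1 = x;
 *
 * If pg_memory_barrier() were only a compiler barrier, each CPU's store could
 * still be sitting in its store buffer when the other CPU loads, so
 * r0 == 0 && r1 == 0 would be possible.  The "lock; addl" above drains the
 * store buffer and rules that outcome out; pg_read_barrier() and
 * pg_write_barrier(), being plain compiler barriers on x86, would not.
 */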

/*
 * Provide implementation for atomics using inline assembly on x86 gcc.  It's
 * nice to support older gcc versions, and the compare/exchange implementation
 * here is actually more efficient than the __sync variant.
 */
#if defined(HAVE_ATOMICS)

#if defined(__GNUC__) || defined(__INTEL_COMPILER)

#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef struct pg_atomic_flag
{
    volatile char value;
} pg_atomic_flag;

#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
    volatile uint32 value;
} pg_atomic_uint32;

/*
 * It's too complicated to write inline asm for 64bit types on 32bit and the
 * 486 can't do it anyway.
 */
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
    /* alignment guaranteed due to being on a 64bit platform */
    volatile uint64 value;
} pg_atomic_uint64;
#endif /* __x86_64__ */

#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

#endif /* defined(HAVE_ATOMICS) */

#if !defined(PG_HAVE_SPIN_DELAY)
/*
 * This sequence is equivalent to the PAUSE instruction ("rep" is
 * ignored by old IA32 processors if the following instruction is
 * not a string operation); the IA-32 Architecture Software
 * Developer's Manual, Vol. 3, Section 7.7.2 describes why using
 * PAUSE in the inner loop of a spin lock is necessary for good
 * performance:
 *
 *     The PAUSE instruction improves the performance of IA-32
 *     processors supporting Hyper-Threading Technology when
 *     executing spin-wait loops and other routines where one
 *     thread is accessing a shared lock or semaphore in a tight
 *     polling loop.  When executing a spin-wait loop, the
 *     processor can suffer a severe performance penalty when
 *     exiting the loop because it detects a possible memory order
 *     violation and flushes the core processor's pipeline.  The
 *     PAUSE instruction provides a hint to the processor that the
 *     code sequence is a spin-wait loop.  The processor uses this
 *     hint to avoid the memory order violation and prevent the
 *     pipeline flush.  In addition, the PAUSE instruction
 *     de-pipelines the spin-wait loop to prevent it from
 *     consuming execution resources excessively.
 */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_SPIN_DELAY
static __inline__ void
pg_spin_delay_impl(void)
{
    __asm__ __volatile__(" rep; nop \n");
}
/*
 * MSVC doesn't define __x86_64__; check its native _M_AMD64 as well, so that
 * 64-bit MSVC builds take the intrinsic path (the MASM-style inline asm in
 * the branch below isn't available on x64).
 */
#elif defined(_MSC_VER) && (defined(_M_AMD64) || defined(__x86_64__))
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    _mm_pause();
}
#elif defined(_MSC_VER)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    /* See comment for gcc code.  Same code, MASM syntax. */
    __asm rep nop;
}
#endif
#endif /* !defined(PG_HAVE_SPIN_DELAY) */
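
/*
 * Editorial usage sketch (hypothetical caller, not part of this header): a
 * test-and-set style spin loop would typically issue the delay hint between
 * attempts, roughly
 *
 *     while (!pg_atomic_test_set_flag(&lock_flag))
 *         pg_spin_delay_impl();
 *
 * PostgreSQL's real spinlock code (storage/s_lock.h and s_lock.c)
 * additionally backs off and eventually sleeps; this only shows where the
 * PAUSE hint fits.
 */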


#if defined(HAVE_ATOMICS)

#if defined(__GNUC__) || defined(__INTEL_COMPILER)

#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
    register char _res = 1;

    /*
     * Atomically exchange 1 into the flag; the old value tells us whether it
     * was already set.  (xchg with a memory operand is implicitly locked, so
     * the explicit prefix is redundant, but harmless.)
     */
    __asm__ __volatile__(
        "  lock           \n"
        "  xchgb  %0,%1   \n"
        : "+q" (_res), "+m" (ptr->value)
        :
        : "memory");
    return _res == 0;
}

#define PG_HAVE_ATOMIC_CLEAR_FLAG
static inline void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
    /*
     * On a TSO architecture like x86 it's sufficient to use a compiler
     * barrier to achieve release semantics.
     */
    __asm__ __volatile__("" ::: "memory");
    ptr->value = 0;
}
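
/*
 * Editorial sketch (hypothetical names, not part of this header): through
 * the generic wrappers in port/atomics.h these two primitives behave like a
 * test-and-set spinlock, e.g.
 *
 *     pg_atomic_flag busy;
 *
 *     pg_atomic_init_flag(&busy);
 *     if (pg_atomic_test_set_flag(&busy))
 *     {
 *         ... critical section ...
 *         pg_atomic_clear_flag(&busy);
 *     }
 *
 * The acquire side gets its ordering from the locked xchg, the release side
 * from the compiler barrier plus x86's TSO store ordering.
 */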

#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
                                    uint32 *expected, uint32 newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag, which it implicitly sets when
     * the values are equal, to report success.
     */
    __asm__ __volatile__(
        "  lock              \n"
        "  cmpxchgl  %4,%5   \n"
        "  setz      %2      \n"
        : "=a" (*expected), "=m" (ptr->value), "=q" (ret)
        : "a" (*expected), "r" (newval), "m" (ptr->value)
        : "memory", "cc");
    return (bool) ret;
}
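
/*
 * Editorial usage sketch (hypothetical caller): a typical compare-exchange
 * retry loop through the public wrapper relies on *expected being refreshed
 * with the current value on failure, which the "=a" output above provides:
 *
 *     uint32 old = pg_atomic_read_u32(&counter);
 *     while (!pg_atomic_compare_exchange_u32(&counter, &old, old | FLAG_BIT))
 *         ;    (old now holds the freshly observed value)
 *
 * "counter" and "FLAG_BIT" are made-up names for illustration only.
 */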

#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
    uint32 res;
    __asm__ __volatile__(
        "  lock           \n"
        "  xaddl  %0,%1   \n"
        : "=q" (res), "=m" (ptr->value)
        : "0" (add_), "m" (ptr->value)
        : "memory", "cc");
    return res;
}
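
/*
 * Editorial note: xadd leaves the previous contents of the target in the
 * source register, so the value returned is the counter as it was *before*
 * the addition, e.g. (hypothetical caller)
 *
 *     uint32 my_ticket = pg_atomic_fetch_add_u32(&next_ticket, 1);
 */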

#ifdef __x86_64__

#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
                                    uint64 *expected, uint64 newval)
{
    char ret;

    /*
     * Perform cmpxchg and use the zero flag, which it implicitly sets when
     * the values are equal, to report success.
     */
    __asm__ __volatile__(
        "  lock              \n"
        "  cmpxchgq  %4,%5   \n"
        "  setz      %2      \n"
        : "=a" (*expected), "=m" (ptr->value), "=q" (ret)
        : "a" (*expected), "r" (newval), "m" (ptr->value)
        : "memory", "cc");
    return (bool) ret;
}

#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
    uint64 res;
    __asm__ __volatile__(
        "  lock           \n"
        "  xaddq  %0,%1   \n"
        : "=q" (res), "=m" (ptr->value)
        : "0" (add_), "m" (ptr->value)
        : "memory", "cc");
    return res;
}

#endif /* __x86_64__ */

#endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

/*
 * 8 byte reads / writes have single-copy atomicity on 32 bit x86 platforms
 * since at least the 586, as well as on all x86-64 cpus.
 */
#if defined(__i586__) || defined(__i686__) ||    /* gcc i586+ */ \
    (defined(_M_IX86) && _M_IX86 >= 500) ||      /* msvc i586+ */ \
    defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, sunpro, msvc */
#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
#endif /* 8 byte single-copy atomicity */
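
/*
 * Editorial note: as far as I can tell, defining this lets the generic
 * fallback (port/atomics/generic.h) implement 64-bit atomic read/write as
 * plain loads and stores instead of emulating them with compare-exchange.
 */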

#endif /* HAVE_ATOMICS */
| 253 | |