/*
 * Copyright 2013-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>
#include <array>
#include <atomic>
#include <cassert>
#include <functional>
#include <limits>
#include <mutex>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include <folly/Indestructible.h>
#include <folly/Likely.h>
#include <folly/Memory.h>
#include <folly/Portability.h>
#include <folly/hash/Hash.h>
#include <folly/lang/Align.h>
#include <folly/lang/Exception.h>
#include <folly/system/ThreadId.h>

namespace folly {

// This file contains several classes that might be useful if you are
// trying to dynamically optimize cache locality: CacheLocality reads
// cache sharing information from sysfs to determine how CPUs should be
// grouped to minimize contention, Getcpu provides fast access to the
// current CPU via __vdso_getcpu, and AccessSpreader uses these two to
// optimally spread accesses among a predetermined number of stripes.
//
// AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is
// substantially less than the cost of a cache miss. This means that we
// can effectively use it to reduce cache line ping-pong on striped data
// structures such as IndexedMemPool or statistics counters.
//
// Because CacheLocality looks at all of the cache levels, it can be
// used for different levels of optimization. AccessSpreader(2) does
// per-chip spreading on a dual socket system. AccessSpreader(numCpus)
// does perfect per-cpu spreading. AccessSpreader(numCpus / 2) does
// perfect L1 spreading in a system with hyperthreading enabled.
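//
// A minimal usage sketch (not part of this header): striping a counter so
// that concurrent increments tend to land on different cache lines. The
// stripe count of 16 and the Slot layout are illustrative choices, not
// anything this header mandates.
//
//   constexpr size_t kStripes = 16;
//   struct alignas(hardware_destructive_interference_size) Slot {
//     std::atomic<uint64_t> value{0};
//   };
//   Slot counters[kStripes];
//
//   void increment() {
//     auto stripe = AccessSpreader<>::current(kStripes);
//     counters[stripe].value.fetch_add(1, std::memory_order_relaxed);
//   }
//
//   uint64_t read() {
//     uint64_t sum = 0;
//     for (auto& slot : counters) {
//       sum += slot.value.load(std::memory_order_relaxed);
//     }
//     return sum;
//   }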

struct CacheLocality {
  /// 1 more than the maximum value that can be returned from sched_getcpu
  /// or getcpu. This is the number of hardware thread contexts provided
  /// by the processors.
  size_t numCpus;

  /// Holds the number of caches present at each cache level (0 is
  /// the closest to the cpu). This is the number of AccessSpreader
  /// stripes needed to avoid cross-cache communication at the specified
  /// layer. numCachesByLevel.front() is the number of L1 caches and
  /// numCachesByLevel.back() is the number of last-level caches.
  std::vector<size_t> numCachesByLevel;

  /// A map from cpu (from sched_getcpu or getcpu) to an index in the
  /// range 0..numCpus-1, where neighboring locality indices are more
  /// likely to share caches than indices far away. All of the members
  /// of a particular cache level will be contiguous in their locality
  /// index. For example, if numCpus is 32 and numCachesByLevel.back()
  /// is 2, then cpus with a locality index < 16 will share one last-level
  /// cache and cpus with a locality index >= 16 will share the other.
  std::vector<size_t> localityIndexByCpu;

  /// Returns the best CacheLocality information available for the current
  /// system, cached for fast access. This will be loaded from sysfs if
  /// possible, otherwise it will be correct in the number of CPUs but
  /// not in their sharing structure.
  ///
  /// If you are into yo dawgs, this is a shared cache of the local
  /// locality of the shared caches.
  ///
  /// The template parameter here is used to allow injection of a
  /// repeatable CacheLocality structure during testing. Rather than
  /// inject the type of the CacheLocality provider into every data type
  /// that transitively uses it, all components select between the default
  /// sysfs implementation and a deterministic implementation by keying
  /// off the type of the underlying atomic. See DeterministicScheduler.
  template <template <typename> class Atom = std::atomic>
  static const CacheLocality& system();

  /// Reads CacheLocality information from a tree structured like
  /// the sysfs filesystem. The provided function will be evaluated
  /// for each sysfs file that needs to be queried. The function
  /// should return a string containing the first line of the file
  /// (not including the newline), or an empty string if the file does
  /// not exist. The function will be called with paths of the form
  /// /sys/devices/system/cpu/cpu*/cache/index*/{type,shared_cpu_list} .
  /// Throws an exception if no caches can be parsed at all.
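  ///
  /// A sketch of a test-only mapping (the topology below is invented, not
  /// taken from a real machine): it describes a single cpu with one data
  /// cache and reports every other file as missing.
  ///
  ///   auto locality = CacheLocality::readFromSysfsTree(
  ///       [](std::string path) -> std::string {
  ///         if (path == "/sys/devices/system/cpu/cpu0/cache/index0/type") {
  ///           return "Data";
  ///         }
  ///         if (path ==
  ///             "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list") {
  ///           return "0";
  ///         }
  ///         return ""; // file does not exist
  ///       });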
  static CacheLocality readFromSysfsTree(
      const std::function<std::string(std::string)>& mapping);

  /// Reads CacheLocality information from the real sysfs filesystem.
  /// Throws an exception if no cache information can be loaded.
  static CacheLocality readFromSysfs();

  /// Returns a usable (but probably not reflective of reality)
  /// CacheLocality structure with the specified number of cpus and a
  /// single cache level that associates one cpu per cache.
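  /// For example, uniform(4) describes 4 cpus with a single cache level
  /// of 4 single-cpu caches, i.e. numCpus == 4 and numCachesByLevel == {4}.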
  static CacheLocality uniform(size_t numCpus);
};

/// Knows how to derive a function pointer to the VDSO implementation of
/// getcpu(2), if available
struct Getcpu {
  /// Function pointer to a function with the same signature as getcpu(2).
  typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused);

  /// Returns a pointer to the VDSO implementation of getcpu(2), if
  /// available, or nullptr otherwise. This function may be quite
  /// expensive, so be sure to cache the result.
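  ///
  /// A sketch of the intended pattern (resolve once, cache the pointer in a
  /// static, and fall back when the VDSO symbol is unavailable):
  ///
  ///   static Getcpu::Func const vdso = Getcpu::resolveVdsoFunc();
  ///   Getcpu::Func func = vdso ? vdso : &FallbackGetcpuType::getcpu;
  ///   unsigned cpu;
  ///   func(&cpu, nullptr, nullptr);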
  static Func resolveVdsoFunc();
};

#ifdef FOLLY_TLS
template <template <typename> class Atom>
struct SequentialThreadId {
  /// Returns the thread id assigned to the current thread
  static unsigned get() {
    auto rv = currentId;
    if (UNLIKELY(rv == 0)) {
      rv = currentId = ++prevId;
    }
    return rv;
  }

 private:
  static Atom<unsigned> prevId;

  static FOLLY_TLS unsigned currentId;
};

template <template <typename> class Atom>
Atom<unsigned> SequentialThreadId<Atom>::prevId(0);

template <template <typename> class Atom>
FOLLY_TLS unsigned SequentialThreadId<Atom>::currentId(0);

// Suppress this instantiation in other translation units. It is
// instantiated in CacheLocality.cpp
extern template struct SequentialThreadId<std::atomic>;
#endif

struct HashingThreadId {
  static unsigned get() {
    return hash::twang_32from64(getCurrentThreadID());
  }
};

/// A class that lazily binds a unique (for each implementation of Atom)
/// identifier to a thread. This is a fallback mechanism for the access
/// spreader if __vdso_getcpu can't be loaded
template <typename ThreadId>
struct FallbackGetcpu {
  /// Fills the thread id into the cpu and node out params (if they
  /// are non-null). This method is intended to act like getcpu when a
  /// fast-enough form of getcpu isn't available or isn't desired
  static int getcpu(unsigned* cpu, unsigned* node, void* /* unused */) {
    auto id = ThreadId::get();
    if (cpu) {
      *cpu = id;
    }
    if (node) {
      *node = id;
    }
    return 0;
  }
};

#ifdef FOLLY_TLS
typedef FallbackGetcpu<SequentialThreadId<std::atomic>> FallbackGetcpuType;
#else
typedef FallbackGetcpu<HashingThreadId> FallbackGetcpuType;
#endif

/// AccessSpreader arranges access to a striped data structure in such a
/// way that concurrently executing threads are likely to be accessing
/// different stripes. It does NOT guarantee uncontended access.
/// Your underlying algorithm must be thread-safe without spreading; this
/// is merely an optimization. AccessSpreader::current(n) is typically
/// much faster than a cache miss (12 nanos on my dev box, tested fast
/// in both 2.6 and 3.2 kernels).
///
/// If available (and not using the deterministic testing implementation)
/// AccessSpreader uses the getcpu system call via VDSO and the
/// precise locality information retrieved from sysfs by CacheLocality.
/// This provides optimal anti-sharing at a fraction of the cost of a
/// cache miss.
///
/// When there are not as many stripes as processors, we try to optimally
/// place the cache sharing boundaries. This means that if you have 2
/// stripes and run on a dual-socket system, your 2 stripes will each get
/// all of the cores from a single socket. If you have 16 stripes on a
/// 16 core system plus hyperthreading (32 cpus), each core will get its
/// own stripe and there will be no cache sharing at all.
///
/// AccessSpreader has a fallback mechanism for when __vdso_getcpu can't be
/// loaded, or for use during deterministic testing. Using sched_getcpu
/// or the getcpu syscall would negate the performance advantages of
/// access spreading, so we use a thread-local value and a shared atomic
/// counter to spread access out. On systems lacking both a fast getcpu()
/// and TLS, we hash the thread id to spread accesses.
///
/// AccessSpreader is templated on the template type that is used
/// to implement atomics, as a way to instantiate the underlying
/// heuristics differently for production use and deterministic unit
/// testing. See DeterministicScheduler for more. If you aren't using
/// DeterministicScheduler, you can just use the default template parameter
/// all of the time.
template <template <typename> class Atom = std::atomic>
struct AccessSpreader {
  /// Returns the stripe associated with the current CPU. The returned
  /// value will be < numStripes.
  static size_t current(size_t numStripes) {
    // widthAndCpuToStripe[0] will actually work okay (all zeros), but
    // something's wrong with the caller
    assert(numStripes > 0);

    unsigned cpu;
    getcpuFunc(&cpu, nullptr, nullptr);
    return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
                              [cpu % kMaxCpus];
  }

#ifdef FOLLY_TLS
  /// Returns the stripe associated with the current CPU. The returned
  /// value will be < numStripes.
  /// This function caches the current cpu in a thread-local variable for a
  /// certain small number of calls, which can make the result imprecise, but
  /// it is more efficient (amortized 2 ns on my dev box, compared to 12 ns for
  /// current()).
  static size_t cachedCurrent(size_t numStripes) {
    return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
                              [cpuCache.cpu()];
  }
#else
  /// Fallback implementation when thread-local storage isn't available.
  static size_t cachedCurrent(size_t numStripes) {
    return current(numStripes);
  }
#endif

 private:
  /// If there are more cpus than this, nothing will crash, but there
  /// might be unnecessary sharing
  enum { kMaxCpus = 128 };

  typedef uint8_t CompactStripe;

  static_assert(
      (kMaxCpus & (kMaxCpus - 1)) == 0,
      "kMaxCpus should be a power of two so modulo is fast");
  static_assert(
      kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
      "stripeByCpu element type isn't wide enough");

  /// Points to the getcpu-like function we are using to obtain the
  /// current cpu. It should not be assumed that the returned cpu value
  /// is in range. We use a static for this so that we can prearrange a
  /// valid value in the pre-constructed state and avoid the need for a
  /// conditional on every subsequent invocation (not normally a big win,
  /// but 20% on some inner loops here).
  static Getcpu::Func getcpuFunc;

  /// For each level of splitting up to kMaxCpus, maps the cpu (mod
  /// kMaxCpus) to the stripe. Rather than performing any inequalities
  /// or modulo on the actual number of cpus, we just fill in the entire
  /// array.
  static CompactStripe widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];

  /// Caches the current CPU and refreshes the cache every so often.
  class CpuCache {
   public:
    unsigned cpu() {
      if (UNLIKELY(cachedCpuUses_-- == 0)) {
        unsigned cpu;
        AccessSpreader::getcpuFunc(&cpu, nullptr, nullptr);
        cachedCpu_ = cpu % kMaxCpus;
        cachedCpuUses_ = kMaxCachedCpuUses - 1;
      }
      return cachedCpu_;
    }

   private:
    static constexpr unsigned kMaxCachedCpuUses = 32;

    unsigned cachedCpu_{0};
    unsigned cachedCpuUses_{0};
  };

#ifdef FOLLY_TLS
  static FOLLY_TLS CpuCache cpuCache;
#endif

  static bool initialized;

  /// Returns the best getcpu implementation for Atom
  static Getcpu::Func pickGetcpuFunc() {
    auto best = Getcpu::resolveVdsoFunc();
    return best ? best : &FallbackGetcpuType::getcpu;
  }

  /// Always claims to be on CPU zero, node zero
  static int degenerateGetcpu(unsigned* cpu, unsigned* node, void*) {
    if (cpu != nullptr) {
      *cpu = 0;
    }
    if (node != nullptr) {
      *node = 0;
    }
    return 0;
  }

  // The function to call for fast lookup of getcpu is a singleton, as
  // is the precomputed table of locality information. AccessSpreader
  // is used in very tight loops, however (we're trying to race an L1
  // cache miss!), so the normal singleton mechanisms are noticeably
  // expensive. Even a not-taken branch guarding access to getcpuFunc
  // slows AccessSpreader::current from 12 nanos to 14. As a result, we
  // populate the static members with simple (but valid) values that can
  // be filled in by the linker, and then follow up with a normal static
  // initializer call that puts in the proper version. This means that
  // when there are initialization order issues we will just observe a
  // zero stripe. Once a sanitizer gets smart enough to detect this as
  // a race or undefined behavior, we can annotate it.

  static bool initialize() {
    getcpuFunc = pickGetcpuFunc();

    auto& cacheLocality = CacheLocality::system<Atom>();
    auto n = cacheLocality.numCpus;
    for (size_t width = 0; width <= kMaxCpus; ++width) {
      auto numStripes = std::max(size_t{1}, width);
      for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
        auto index = cacheLocality.localityIndexByCpu[cpu];
        assert(index < n);
        // as index goes from 0..n, post-transform value goes from
        // 0..numStripes
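        // (for example, with n == 32 cpus and numStripes == 2, locality
        // indices 0..15 map to stripe 0 and 16..31 map to stripe 1,
        // matching the last-level-cache example in CacheLocality above)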
        widthAndCpuToStripe[width][cpu] =
            CompactStripe((index * numStripes) / n);
        assert(widthAndCpuToStripe[width][cpu] < numStripes);
      }
      for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
        widthAndCpuToStripe[width][cpu] = widthAndCpuToStripe[width][cpu - n];
      }
    }
    return true;
  }
};

template <template <typename> class Atom>
Getcpu::Func AccessSpreader<Atom>::getcpuFunc =
    AccessSpreader<Atom>::degenerateGetcpu;

template <template <typename> class Atom>
typename AccessSpreader<Atom>::CompactStripe
    AccessSpreader<Atom>::widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus] = {};

#ifdef FOLLY_TLS
template <template <typename> class Atom>
FOLLY_TLS
    typename AccessSpreader<Atom>::CpuCache AccessSpreader<Atom>::cpuCache;
#endif

template <template <typename> class Atom>
bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize();

// Suppress this instantiation in other translation units. It is
// instantiated in CacheLocality.cpp
extern template struct AccessSpreader<std::atomic>;

/**
 * A simple freelist allocator. Allocates things of size sz, from
 * slabs of size allocSize. Takes a lock on each
 * allocation/deallocation.
 */
class SimpleAllocator {
  std::mutex m_;
  uint8_t* mem_{nullptr};
  uint8_t* end_{nullptr};
  void* freelist_{nullptr};
  size_t allocSize_;
  size_t sz_;
  std::vector<void*> blocks_;

 public:
  SimpleAllocator(size_t allocSize, size_t sz);
  ~SimpleAllocator();
  void* allocateHard();

  // Inline fast-paths.
  void* allocate() {
    std::lock_guard<std::mutex> g(m_);
    // Freelist allocation.
    if (freelist_) {
      auto mem = freelist_;
      freelist_ = *static_cast<void**>(freelist_);
      return mem;
    }

    // Bump-ptr allocation.
    if (intptr_t(mem_) % 128 == 0) {
      // Avoid allocating pointers that may look like malloc
      // pointers.
      mem_ += std::min(sz_, max_align_v);
    }
    if (mem_ && (mem_ + sz_ <= end_)) {
      auto mem = mem_;
      mem_ += sz_;

      assert(intptr_t(mem) % 128 != 0);
      return mem;
    }

    return allocateHard();
  }
  void deallocate(void* mem) {
    std::lock_guard<std::mutex> g(m_);
    *static_cast<void**>(mem) = freelist_;
    freelist_ = mem;
  }
};
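
// A minimal usage sketch (the 4096-byte slab size and 32-byte object size
// are illustrative). allocate() and deallocate() each take the allocator's
// internal lock, as documented above.
//
//   SimpleAllocator alloc(4096, 32);
//   void* p = alloc.allocate();
//   // ... use the 32 bytes at p ...
//   alloc.deallocate(p);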

/**
 * An allocator that can be used with CacheLocality to allocate
 * core-local memory.
 *
 * There is actually nothing special about the memory itself (it is
 * not bound to numa nodes or anything), but the allocator guarantees
 * that memory allocated from the same stripe will only come from cache
 * lines also allocated to the same stripe. This means multiple
 * things using CacheLocality can allocate memory in smaller-than-cacheline
 * increments, and be assured that it won't cause more false
 * sharing than it otherwise would.
 *
 * Note that allocation and deallocation take a per-sizeclass lock.
 */
template <size_t Stripes>
class CoreRawAllocator {
 public:
  class Allocator {
    static constexpr size_t AllocSize{4096};

    uint8_t sizeClass(size_t size) {
      if (size <= 8) {
        return 0;
      } else if (size <= 16) {
        return 1;
      } else if (size <= 32) {
        return 2;
      } else if (size <= 64) {
        return 3;
      } else { // punt to malloc.
        return 4;
      }
    }

    std::array<SimpleAllocator, 4> allocators_{
        {{AllocSize, 8}, {AllocSize, 16}, {AllocSize, 32}, {AllocSize, 64}}};

   public:
    void* allocate(size_t size) {
      auto cl = sizeClass(size);
      if (cl == 4) {
        // Align to a cacheline
        size = size + (hardware_destructive_interference_size - 1);
        size &= ~size_t(hardware_destructive_interference_size - 1);
        void* mem =
            aligned_malloc(size, hardware_destructive_interference_size);
        if (!mem) {
          throw_exception<std::bad_alloc>();
        }
        return mem;
      }
      return allocators_[cl].allocate();
    }
    void deallocate(void* mem, size_t = 0) {
      if (!mem) {
        return;
      }

      // See if it came from this allocator or malloc.
      if (intptr_t(mem) % 128 != 0) {
        auto addr =
            reinterpret_cast<void*>(intptr_t(mem) & ~intptr_t(AllocSize - 1));
        auto allocator = *static_cast<SimpleAllocator**>(addr);
        allocator->deallocate(mem);
      } else {
        aligned_free(mem);
      }
    }
  };

  Allocator* get(size_t stripe) {
    assert(stripe < Stripes);
    return &allocators_[stripe];
  }

 private:
  Allocator allocators_[Stripes];
};

template <typename T, size_t Stripes>
CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>
getCoreAllocator(size_t stripe) {
  // We cannot make sure that the allocator will be destroyed after
  // all the objects allocated with it, so we leak it.
  static Indestructible<CoreRawAllocator<Stripes>> allocator;
  return CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>(
      *allocator->get(stripe));
}
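
// A usage sketch (the stripe count of 64 and the use of std::vector are
// illustrative choices): pick the stripe for the current cpu with
// AccessSpreader, then allocate through the matching core-local allocator.
//
//   constexpr size_t kStripes = 64;
//   auto stripe = AccessSpreader<>::current(kStripes);
//   auto alloc = getCoreAllocator<int, kStripes>(stripe);
//   std::vector<int, decltype(alloc)> vec(alloc);
//   vec.push_back(42);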

} // namespace folly