/*
 * Copyright 2013-present Facebook, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <algorithm>
#include <array>
#include <atomic>
#include <cassert>
#include <functional>
#include <limits>
#include <mutex>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>

#include <folly/Indestructible.h>
#include <folly/Likely.h>
#include <folly/Memory.h>
#include <folly/Portability.h>
#include <folly/hash/Hash.h>
#include <folly/lang/Align.h>
#include <folly/lang/Exception.h>
#include <folly/system/ThreadId.h>

namespace folly {

// This file contains several classes that might be useful if you are
// trying to dynamically optimize cache locality: CacheLocality reads
// cache sharing information from sysfs to determine how CPUs should be
// grouped to minimize contention, Getcpu provides fast access to the
// current CPU via __vdso_getcpu, and AccessSpreader uses these two to
// optimally spread accesses among a predetermined number of stripes.
//
// AccessSpreader<>::current(n) microbenchmarks at 22 nanos, which is
// substantially less than the cost of a cache miss. This means that we
// can effectively use it to reduce cache line ping-pong on striped data
// structures such as IndexedMemPool or statistics counters.
//
// Because CacheLocality looks at all of the cache levels, it can be
// used for different levels of optimization. AccessSpreader(2) does
// per-chip spreading on a dual socket system. AccessSpreader(numCpus)
// does perfect per-cpu spreading. AccessSpreader(numCpus / 2) does
// perfect L1 spreading in a system with hyperthreading enabled.
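//
// A minimal usage sketch (not part of this header): striping a counter so
// that concurrent increments tend to land on different cache lines. The
// stripe count of 16 and the Slot layout are illustrative choices, not
// anything this header mandates.
//
//   constexpr size_t kStripes = 16;
//   struct alignas(hardware_destructive_interference_size) Slot {
//     std::atomic<uint64_t> value{0};
//   };
//   Slot counters[kStripes];
//
//   void increment() {
//     auto stripe = AccessSpreader<>::current(kStripes);
//     counters[stripe].value.fetch_add(1, std::memory_order_relaxed);
//   }
//
//   uint64_t read() {
//     uint64_t sum = 0;
//     for (auto& slot : counters) {
//       sum += slot.value.load(std::memory_order_relaxed);
//     }
//     return sum;
//   }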

struct CacheLocality {
  /// 1 more than the maximum value that can be returned from sched_getcpu
  /// or getcpu. This is the number of hardware thread contexts provided
  /// by the processors.
  size_t numCpus;

  /// Holds the number of caches present at each cache level (0 is
  /// the closest to the cpu). This is the number of AccessSpreader
  /// stripes needed to avoid cross-cache communication at the specified
  /// layer. numCachesByLevel.front() is the number of L1 caches and
  /// numCachesByLevel.back() is the number of last-level caches.
  std::vector<size_t> numCachesByLevel;

  /// A map from cpu (from sched_getcpu or getcpu) to an index in the
  /// range 0..numCpus-1, where neighboring locality indices are more
  /// likely to share caches than indices far away. All of the members
  /// of a particular cache level will be contiguous in their locality
  /// index. For example, if numCpus is 32 and numCachesByLevel.back()
  /// is 2, then cpus with a locality index < 16 will share one last-level
  /// cache and cpus with a locality index >= 16 will share the other.
  std::vector<size_t> localityIndexByCpu;

  /// Returns the best CacheLocality information available for the current
  /// system, cached for fast access. This will be loaded from sysfs if
  /// possible, otherwise it will be correct in the number of CPUs but
  /// not in their sharing structure.
  ///
  /// If you are into yo dawgs, this is a shared cache of the local
  /// locality of the shared caches.
  ///
  /// The template parameter here is used to allow injection of a
  /// repeatable CacheLocality structure during testing. Rather than
  /// inject the type of the CacheLocality provider into every data type
  /// that transitively uses it, all components select between the default
  /// sysfs implementation and a deterministic implementation by keying
  /// off the type of the underlying atomic. See DeterministicScheduler.
  template <template <typename> class Atom = std::atomic>
  static const CacheLocality& system();

  /// Reads CacheLocality information from a tree structured like
  /// the sysfs filesystem. The provided function will be evaluated
  /// for each sysfs file that needs to be queried. The function
  /// should return a string containing the first line of the file
  /// (not including the newline), or an empty string if the file does
  /// not exist. The function will be called with paths of the form
  /// /sys/devices/system/cpu/cpu*/cache/index*/{type,shared_cpu_list} .
  /// Throws an exception if no caches can be parsed at all.
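  ///
  /// A sketch of a test-only mapping (the topology below is invented, not
  /// taken from a real machine): it describes a single cpu with one data
  /// cache and reports every other file as missing.
  ///
  ///   auto locality = CacheLocality::readFromSysfsTree(
  ///       [](std::string path) -> std::string {
  ///         if (path == "/sys/devices/system/cpu/cpu0/cache/index0/type") {
  ///           return "Data";
  ///         }
  ///         if (path ==
  ///             "/sys/devices/system/cpu/cpu0/cache/index0/shared_cpu_list") {
  ///           return "0";
  ///         }
  ///         return ""; // file does not exist
  ///       });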
  static CacheLocality readFromSysfsTree(
      const std::function<std::string(std::string)>& mapping);

  /// Reads CacheLocality information from the real sysfs filesystem.
  /// Throws an exception if no cache information can be loaded.
  static CacheLocality readFromSysfs();

  /// Returns a usable (but probably not reflective of reality)
  /// CacheLocality structure with the specified number of cpus and a
  /// single cache level that associates one cpu per cache.
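  /// For example, uniform(4) describes 4 cpus with a single cache level
  /// of 4 single-cpu caches, i.e. numCpus == 4 and numCachesByLevel == {4}.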
  static CacheLocality uniform(size_t numCpus);
};

/// Knows how to derive a function pointer to the VDSO implementation of
/// getcpu(2), if available
struct Getcpu {
  /// Function pointer to a function with the same signature as getcpu(2).
  typedef int (*Func)(unsigned* cpu, unsigned* node, void* unused);

  /// Returns a pointer to the VDSO implementation of getcpu(2), if
  /// available, or nullptr otherwise. This function may be quite
  /// expensive, so be sure to cache the result.
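  ///
  /// A sketch of the intended pattern (resolve once, cache the pointer in a
  /// static, and fall back when the VDSO symbol is unavailable):
  ///
  ///   static Getcpu::Func const vdso = Getcpu::resolveVdsoFunc();
  ///   Getcpu::Func func = vdso ? vdso : &FallbackGetcpuType::getcpu;
  ///   unsigned cpu;
  ///   func(&cpu, nullptr, nullptr);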
  static Func resolveVdsoFunc();
};

#ifdef FOLLY_TLS
template <template <typename> class Atom>
struct SequentialThreadId {
  /// Returns the thread id assigned to the current thread
  static unsigned get() {
    auto rv = currentId;
    if (UNLIKELY(rv == 0)) {
      rv = currentId = ++prevId;
    }
    return rv;
  }

 private:
  static Atom<unsigned> prevId;

  static FOLLY_TLS unsigned currentId;
};

template <template <typename> class Atom>
Atom<unsigned> SequentialThreadId<Atom>::prevId(0);

template <template <typename> class Atom>
FOLLY_TLS unsigned SequentialThreadId<Atom>::currentId(0);

// Suppress this instantiation in other translation units. It is
// instantiated in CacheLocality.cpp
extern template struct SequentialThreadId<std::atomic>;
#endif

struct HashingThreadId {
  static unsigned get() {
    return hash::twang_32from64(getCurrentThreadID());
  }
};

/// A class that lazily binds a unique (for each implementation of Atom)
/// identifier to a thread. This is a fallback mechanism for the access
/// spreader if __vdso_getcpu can't be loaded
template <typename ThreadId>
struct FallbackGetcpu {
  /// Fills the thread id into the cpu and node out params (if they
  /// are non-null). This method is intended to act like getcpu when a
  /// fast-enough form of getcpu isn't available or isn't desired
  static int getcpu(unsigned* cpu, unsigned* node, void* /* unused */) {
    auto id = ThreadId::get();
    if (cpu) {
      *cpu = id;
    }
    if (node) {
      *node = id;
    }
    return 0;
  }
};

#ifdef FOLLY_TLS
typedef FallbackGetcpu<SequentialThreadId<std::atomic>> FallbackGetcpuType;
#else
typedef FallbackGetcpu<HashingThreadId> FallbackGetcpuType;
#endif

/// AccessSpreader arranges access to a striped data structure in such a
/// way that concurrently executing threads are likely to be accessing
/// different stripes. It does NOT guarantee uncontended access.
/// Your underlying algorithm must be thread-safe without spreading; this
/// is merely an optimization. AccessSpreader::current(n) is typically
/// much faster than a cache miss (12 nanos on my dev box, tested fast
/// in both 2.6 and 3.2 kernels).
///
/// If available (and not using the deterministic testing implementation)
/// AccessSpreader uses the getcpu system call via VDSO and the
/// precise locality information retrieved from sysfs by CacheLocality.
/// This provides optimal anti-sharing at a fraction of the cost of a
/// cache miss.
///
/// When there are not as many stripes as processors, we try to optimally
/// place the cache sharing boundaries. This means that if you have 2
/// stripes and run on a dual-socket system, your 2 stripes will each get
/// all of the cores from a single socket. If you have 16 stripes on a
/// 16 core system plus hyperthreading (32 cpus), each core will get its
/// own stripe and there will be no cache sharing at all.
///
/// AccessSpreader has a fallback mechanism for when __vdso_getcpu can't be
/// loaded, or for use during deterministic testing. Using sched_getcpu
/// or the getcpu syscall would negate the performance advantages of
/// access spreading, so we use a thread-local value and a shared atomic
/// counter to spread access out. On systems lacking both a fast getcpu()
/// and TLS, we hash the thread id to spread accesses.
///
/// AccessSpreader is templated on the template type that is used
/// to implement atomics, as a way to instantiate the underlying
/// heuristics differently for production use and deterministic unit
/// testing. See DeterministicScheduler for more. If you aren't using
/// DeterministicScheduler, you can just use the default template parameter
/// all of the time.
template <template <typename> class Atom = std::atomic>
struct AccessSpreader {
  /// Returns the stripe associated with the current CPU. The returned
  /// value will be < numStripes.
  static size_t current(size_t numStripes) {
    // widthAndCpuToStripe[0] will actually work okay (all zeros), but
    // something's wrong with the caller
    assert(numStripes > 0);

    unsigned cpu;
    getcpuFunc(&cpu, nullptr, nullptr);
    return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
                              [cpu % kMaxCpus];
  }

#ifdef FOLLY_TLS
  /// Returns the stripe associated with the current CPU. The returned
  /// value will be < numStripes.
  /// This function caches the current cpu in a thread-local variable for a
  /// certain small number of calls, which can make the result imprecise, but
  /// it is more efficient (amortized 2 ns on my dev box, compared to 12 ns for
  /// current()).
  static size_t cachedCurrent(size_t numStripes) {
    return widthAndCpuToStripe[std::min(size_t(kMaxCpus), numStripes)]
                              [cpuCache.cpu()];
  }
#else
  /// Fallback implementation when thread-local storage isn't available.
  static size_t cachedCurrent(size_t numStripes) {
    return current(numStripes);
  }
#endif

 private:
  /// If there are more cpus than this, nothing will crash, but there
  /// might be unnecessary sharing
  enum { kMaxCpus = 128 };

  typedef uint8_t CompactStripe;

  static_assert(
      (kMaxCpus & (kMaxCpus - 1)) == 0,
      "kMaxCpus should be a power of two so modulo is fast");
  static_assert(
      kMaxCpus - 1 <= std::numeric_limits<CompactStripe>::max(),
      "stripeByCpu element type isn't wide enough");

  /// Points to the getcpu-like function we are using to obtain the
  /// current cpu. It should not be assumed that the returned cpu value
  /// is in range. We use a static for this so that we can prearrange a
  /// valid value in the pre-constructed state and avoid the need for a
  /// conditional on every subsequent invocation (not normally a big win,
  /// but 20% on some inner loops here).
  static Getcpu::Func getcpuFunc;

  /// For each level of splitting up to kMaxCpus, maps the cpu (mod
  /// kMaxCpus) to the stripe. Rather than performing any inequalities
  /// or modulo on the actual number of cpus, we just fill in the entire
  /// array.
  static CompactStripe widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus];

  /// Caches the current CPU and refreshes the cache every so often.
  class CpuCache {
   public:
    unsigned cpu() {
      if (UNLIKELY(cachedCpuUses_-- == 0)) {
        unsigned cpu;
        AccessSpreader::getcpuFunc(&cpu, nullptr, nullptr);
        cachedCpu_ = cpu % kMaxCpus;
        cachedCpuUses_ = kMaxCachedCpuUses - 1;
      }
      return cachedCpu_;
    }

   private:
    static constexpr unsigned kMaxCachedCpuUses = 32;

    unsigned cachedCpu_{0};
    unsigned cachedCpuUses_{0};
  };

#ifdef FOLLY_TLS
  static FOLLY_TLS CpuCache cpuCache;
#endif

  static bool initialized;

  /// Returns the best getcpu implementation for Atom
  static Getcpu::Func pickGetcpuFunc() {
    auto best = Getcpu::resolveVdsoFunc();
    return best ? best : &FallbackGetcpuType::getcpu;
  }

  /// Always claims to be on CPU zero, node zero
  static int degenerateGetcpu(unsigned* cpu, unsigned* node, void*) {
    if (cpu != nullptr) {
      *cpu = 0;
    }
    if (node != nullptr) {
      *node = 0;
    }
    return 0;
  }

  // The function to call for fast lookup of getcpu is a singleton, as
  // is the precomputed table of locality information. AccessSpreader
  // is used in very tight loops, however (we're trying to race an L1
  // cache miss!), so the normal singleton mechanisms are noticeably
  // expensive. Even a not-taken branch guarding access to getcpuFunc
  // slows AccessSpreader::current from 12 nanos to 14. As a result, we
  // populate the static members with simple (but valid) values that can
  // be filled in by the linker, and then follow up with a normal static
  // initializer call that puts in the proper version. This means that
  // when there are initialization order issues we will just observe a
  // zero stripe. Once a sanitizer gets smart enough to detect this as
  // a race or undefined behavior, we can annotate it.

  static bool initialize() {
    getcpuFunc = pickGetcpuFunc();

    auto& cacheLocality = CacheLocality::system<Atom>();
    auto n = cacheLocality.numCpus;
    for (size_t width = 0; width <= kMaxCpus; ++width) {
      auto numStripes = std::max(size_t{1}, width);
      for (size_t cpu = 0; cpu < kMaxCpus && cpu < n; ++cpu) {
        auto index = cacheLocality.localityIndexByCpu[cpu];
        assert(index < n);
        // as index goes from 0..n, post-transform value goes from
        // 0..numStripes
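        // (for example, with n == 32 cpus and numStripes == 2, locality
        // indices 0..15 map to stripe 0 and 16..31 map to stripe 1,
        // matching the last-level-cache example in CacheLocality above)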
        widthAndCpuToStripe[width][cpu] =
            CompactStripe((index * numStripes) / n);
        assert(widthAndCpuToStripe[width][cpu] < numStripes);
      }
      for (size_t cpu = n; cpu < kMaxCpus; ++cpu) {
        widthAndCpuToStripe[width][cpu] = widthAndCpuToStripe[width][cpu - n];
      }
    }
    return true;
  }
};

template <template <typename> class Atom>
Getcpu::Func AccessSpreader<Atom>::getcpuFunc =
    AccessSpreader<Atom>::degenerateGetcpu;

template <template <typename> class Atom>
typename AccessSpreader<Atom>::CompactStripe
    AccessSpreader<Atom>::widthAndCpuToStripe[kMaxCpus + 1][kMaxCpus] = {};

#ifdef FOLLY_TLS
template <template <typename> class Atom>
FOLLY_TLS
    typename AccessSpreader<Atom>::CpuCache AccessSpreader<Atom>::cpuCache;
#endif

template <template <typename> class Atom>
bool AccessSpreader<Atom>::initialized = AccessSpreader<Atom>::initialize();

// Suppress this instantiation in other translation units. It is
// instantiated in CacheLocality.cpp
extern template struct AccessSpreader<std::atomic>;

/**
 * A simple freelist allocator. Allocates things of size sz, from
 * slabs of size allocSize. Takes a lock on each
 * allocation/deallocation.
 */
class SimpleAllocator {
  std::mutex m_;
  uint8_t* mem_{nullptr};
  uint8_t* end_{nullptr};
  void* freelist_{nullptr};
  size_t allocSize_;
  size_t sz_;
  std::vector<void*> blocks_;

 public:
  SimpleAllocator(size_t allocSize, size_t sz);
  ~SimpleAllocator();
  void* allocateHard();

  // Inline fast-paths.
  void* allocate() {
    std::lock_guard<std::mutex> g(m_);
    // Freelist allocation.
    if (freelist_) {
      auto mem = freelist_;
      freelist_ = *static_cast<void**>(freelist_);
      return mem;
    }

    // Bump-ptr allocation.
    if (intptr_t(mem_) % 128 == 0) {
      // Avoid allocating pointers that may look like malloc
      // pointers.
      mem_ += std::min(sz_, max_align_v);
    }
    if (mem_ && (mem_ + sz_ <= end_)) {
      auto mem = mem_;
      mem_ += sz_;

      assert(intptr_t(mem) % 128 != 0);
      return mem;
    }

    return allocateHard();
  }
  void deallocate(void* mem) {
    std::lock_guard<std::mutex> g(m_);
    *static_cast<void**>(mem) = freelist_;
    freelist_ = mem;
  }
};
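
// A minimal usage sketch (the 4096-byte slab size and 32-byte object size
// are illustrative). allocate() and deallocate() each take the allocator's
// internal lock, as documented above.
//
//   SimpleAllocator alloc(4096, 32);
//   void* p = alloc.allocate();
//   // ... use the 32 bytes at p ...
//   alloc.deallocate(p);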

/**
 * An allocator that can be used with CacheLocality to allocate
 * core-local memory.
 *
 * There is actually nothing special about the memory itself (it is
 * not bound to numa nodes or anything), but the allocator guarantees
 * that memory allocated from the same stripe will only come from cache
 * lines also allocated to the same stripe. This means multiple
 * things using CacheLocality can allocate memory in smaller-than-cacheline
 * increments, and be assured that it won't cause more false
 * sharing than it otherwise would.
 *
 * Note that allocation and deallocation take a per-sizeclass lock.
 */
template <size_t Stripes>
class CoreRawAllocator {
 public:
  class Allocator {
    static constexpr size_t AllocSize{4096};

    uint8_t sizeClass(size_t size) {
      if (size <= 8) {
        return 0;
      } else if (size <= 16) {
        return 1;
      } else if (size <= 32) {
        return 2;
      } else if (size <= 64) {
        return 3;
      } else { // punt to malloc.
        return 4;
      }
    }

    std::array<SimpleAllocator, 4> allocators_{
        {{AllocSize, 8}, {AllocSize, 16}, {AllocSize, 32}, {AllocSize, 64}}};

   public:
    void* allocate(size_t size) {
      auto cl = sizeClass(size);
      if (cl == 4) {
        // Align to a cacheline
        size = size + (hardware_destructive_interference_size - 1);
        size &= ~size_t(hardware_destructive_interference_size - 1);
        void* mem =
            aligned_malloc(size, hardware_destructive_interference_size);
        if (!mem) {
          throw_exception<std::bad_alloc>();
        }
        return mem;
      }
      return allocators_[cl].allocate();
    }
    void deallocate(void* mem, size_t = 0) {
      if (!mem) {
        return;
      }

      // See if it came from this allocator or malloc.
      if (intptr_t(mem) % 128 != 0) {
        auto addr =
            reinterpret_cast<void*>(intptr_t(mem) & ~intptr_t(AllocSize - 1));
        auto allocator = *static_cast<SimpleAllocator**>(addr);
        allocator->deallocate(mem);
      } else {
        aligned_free(mem);
      }
    }
  };

  Allocator* get(size_t stripe) {
    assert(stripe < Stripes);
    return &allocators_[stripe];
  }

 private:
  Allocator allocators_[Stripes];
};

template <typename T, size_t Stripes>
CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>
getCoreAllocator(size_t stripe) {
  // We cannot make sure that the allocator will be destroyed after
  // all the objects allocated with it, so we leak it.
  static Indestructible<CoreRawAllocator<Stripes>> allocator;
  return CxxAllocatorAdaptor<T, typename CoreRawAllocator<Stripes>::Allocator>(
      *allocator->get(stripe));
}
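
// A usage sketch (the stripe count of 64 and the use of std::vector are
// illustrative choices): pick the stripe for the current cpu with
// AccessSpreader, then allocate through the matching core-local allocator.
//
//   constexpr size_t kStripes = 64;
//   auto stripe = AccessSpreader<>::current(kStripes);
//   auto alloc = getCoreAllocator<int, kStripes>(stripe);
//   std::vector<int, decltype(alloc)> vec(alloc);
//   vec.push_back(42);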

} // namespace folly